Code example #1
    def test_predict_proba_binary_classification(self):
        self.output_dir = os.path.join(
            os.getcwd(), '.test_predict_proba_binary_classification')
        D = get_binary_classification_datamanager()

        class Dummy2(object):
            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9]] * 23)

            def fit(self, X, y):
                return self

        model = Dummy2()

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, self.output_dir, configuration)
        evaluator.model = model
        loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
            evaluator.fit_predict_and_loss()

        for i in range(23):
            self.assertEqual(0.9, Y_optimization_pred[i][1])
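
A note for orientation across this listing: get_configuration_space is driven by a small metadata dictionary (the D.info object in the tests) plus optional include/exclude filters, and it returns a configuration space that can be sampled. Below is a minimal sketch of that calling pattern; the import paths and exact signature follow the older auto-sklearn layout these snippets appear to use, so treat them as assumptions rather than the definitive API.

# Hedged sketch of the calling pattern used throughout these examples.
# Import paths are assumptions based on older auto-sklearn releases.
from autosklearn.constants import MULTICLASS_CLASSIFICATION
from autosklearn.util.pipeline import get_configuration_space

info = {
    'task': MULTICLASS_CLASSIFICATION,  # task-type constant, as in the tests
    'is_sparse': False,                 # dense feature matrix
    'label_num': 3,                     # number of classes
}
configuration_space = get_configuration_space(
    info,
    include_estimators=['extra_trees'],
    include_preprocessors=['no_preprocessing'])

# Draw one candidate configuration to evaluate, as the tests below do.
configuration = configuration_space.sample_configuration()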
Code example #2
    def test_with_abalone(self):
        dataset = 'abalone'
        dataset_path = os.path.join(os.path.dirname(__file__), '.datasets',
                                    dataset)
        D = CompetitionDataManager(dataset_path)
        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        errors = []
        for i in range(N_TEST_RUNS):
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_, configuration,
                                          inner_cv_folds=2,
                                          outer_cv_folds=2)
            if not self._fit(evaluator):
                continue
            err = evaluator.predict()
            self.assertLess(err, 0.99)
            self.assertTrue(np.isfinite(err))
            errors.append(err)
        # This is a reasonable bound
        self.assertEqual(10, len(errors))
        self.assertLess(min(errors), 0.77)
Code example #3
    def _create_search_space(self,
                             tmp_dir,
                             backend,
                             datamanager,
                             include_estimators=None,
                             exclude_estimators=None,
                             include_preprocessors=None,
                             exclude_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            exclude_estimators=exclude_estimators,
            include_preprocessors=include_preprocessors,
            exclude_preprocessors=exclude_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path
Code example #4
    def test_with_abalone(self):
        dataset = 'abalone'
        dataset_path = os.path.join(os.path.dirname(__file__), '.datasets',
                                    dataset)
        D = CompetitionDataManager(dataset_path)
        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        errors = []
        for i in range(N_TEST_RUNS):
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_,
                                          configuration,
                                          inner_cv_folds=2,
                                          outer_cv_folds=2)
            if not self._fit(evaluator):
                continue
            err = evaluator.predict()
            self.assertLess(err, 0.99)
            self.assertTrue(np.isfinite(err))
            errors.append(err)
        # This is a reasonable bound
        self.assertEqual(10, len(errors))
        self.assertLess(min(errors), 0.77)
Code example #5
    def test_predict_proba_binary_classification(self):
        self.output_dir = os.path.join(os.getcwd(),
                                       '.test_predict_proba_binary_classification')
        D = get_binary_classification_datamanager()

        class Dummy2(object):

            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9]] * 23)

            def fit(self, X, y):
                return self

        model = Dummy2()

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, self.output_dir, configuration)
        evaluator.model = model
        loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
            evaluator.fit_predict_and_loss()

        for i in range(23):
            self.assertEqual(0.9, Y_optimization_pred[i][1])
Code example #6
    def test_file_output(self):
        self.output_dir = os.path.join(os.getcwd(), '.test_file_output')

        D = get_regression_datamanager()
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        configuration = configuration_space.sample_configuration()
        backend_api = backend.create(self.output_dir, self.output_dir)
        evaluator = HoldoutEvaluator(D,
                                     backend_api,
                                     configuration,
                                     with_predictions=True,
                                     all_scoring_functions=True,
                                     output_y_test=True)

        loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
            evaluator.fit_predict_and_loss()
        evaluator.file_output(loss, Y_optimization_pred, Y_valid_pred,
                              Y_test_pred)

        self.assertTrue(
            os.path.exists(
                os.path.join(self.output_dir, '.auto-sklearn',
                             'true_targets_ensemble.npy')))
Code example #7
def main():
    parser = ArgumentParser()

    parser.add_argument("--working-directory", type=str, required=True)
    parser.add_argument("--cutoff", type=int, default=-1)
    parser.add_argument("--only-best", type=bool, default=True)

    args = parser.parse_args()
    working_directory = args.working_directory
    cutoff = args.cutoff
    only_best = args.only_best

    for task_type in ('classification', 'regression'):
        if task_type == 'classification':
            metadata_sets = itertools.product(
                [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
                CLASSIFICATION_METRICS)
            input_directory = os.path.join(working_directory, 'configuration',
                                           'classification')
        elif task_type == 'regression':
            metadata_sets = itertools.product(
                [0, 1], [REGRESSION], REGRESSION_METRICS)
            input_directory = os.path.join(working_directory, 'configuration',
                                           'regression')
        else:
            raise ValueError(task_type)

        output_dir = os.path.join(working_directory, 'configuration_results')

        for sparse, task, metric in metadata_sets:
            print(TASK_TYPES_TO_STRING[task], metric, sparse)

            output_dir_ = os.path.join(output_dir, '%s_%s_%s' % (
                metric, TASK_TYPES_TO_STRING[task],
                'sparse' if sparse else 'dense'))

            configuration_space = pipeline.get_configuration_space(
                {'is_sparse': sparse, 'task': task})

            outputs, configurations = retrieve_matadata(
                validation_directory=input_directory,
                metric=metric,
                cutoff=cutoff,
                configuration_space=configuration_space,
                only_best=only_best)

            if len(outputs) == 0:
                print("No output found for %s, %s, %s" %
                      (metric, TASK_TYPES_TO_STRING[task],
                       'sparse' if sparse else 'dense'))
                continue

            try:
                os.makedirs(output_dir_)
            except:
                pass

            write_output(outputs, configurations, output_dir_,
                         configuration_space, metric)
Code example #8
    def setUp(self):
        self.queue = multiprocessing.Queue()
        self.configuration = get_configuration_space(
            {'task': MULTICLASS_CLASSIFICATION,
             'is_sparse': False}).get_default_configuration()
        self.data = get_multiclass_classification_datamanager()
        self.tmp_dir = os.path.join(os.path.dirname(__file__),
                                    '.test_cv_functions')
Code example #9
    def setUp(self):
        self.queue = multiprocessing.Queue()
        self.configuration = get_configuration_space({
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False
        }).get_default_configuration()
        self.data = get_multiclass_classification_datamanager()
        self.tmp_dir = os.path.join(os.path.dirname(__file__),
                                    '.test_holdout_functions')
Code example #10
    def setUp(self):
        self.queue = multiprocessing.Queue()
        self.configuration = get_configuration_space(
            {'task': MULTICLASS_CLASSIFICATION,
             'is_sparse': False}).get_default_configuration()
        self.data = get_multiclass_classification_datamanager()
        self.tmp_dir = os.path.join(os.path.dirname(__file__),
                                    '.test_cv_functions')
        self.backend = unittest.mock.Mock(spec=Backend)
        self.backend.load_datamanager.return_value = self.data
        self.dataset_name = json.dumps({'task_id': 'test'})
Code example #11
def main():
    parser = ArgumentParser()

    parser.add_argument("configuration_directory",
                        metavar="configuration-directory")
    parser.add_argument("output_directory", metavar="output-directory")
    parser.add_argument("--cutoff", type=int, default=-1,
                        help="Only consider the validation performances up to "
                             "this time.")
    parser.add_argument("--num-runs", type=int, default=1)
    parser.add_argument("--only-best", type=bool, default=False,
                        help="Look only for the best configuration in the "
                             "validation files.")

    args = parser.parse_args()
    configuration_directory = args.configuration_directory
    output_dir = args.output_directory
    cutoff = int(args.cutoff)
    num_runs = args.num_runs

    for sparse, task in [(1, BINARY_CLASSIFICATION),
                         (1, MULTICLASS_CLASSIFICATION),
                         (0, BINARY_CLASSIFICATION),
                         (0, MULTICLASS_CLASSIFICATION)]:

        for metric in ['acc_metric', 'auc_metric', 'bac_metric', 'f1_metric',
                       'pac_metric']:

            output_dir_ = os.path.join(output_dir, '%s_%s_%s' % (
                metric, TASK_TYPES_TO_STRING[task], 'sparse' if sparse else 'dense'))

            configuration_space = pipeline.get_configuration_space(
                {'is_sparse': sparse, 'task': task}
            )

            try:
                os.makedirs(output_dir_)
            except:
                pass

            outputs, configurations = retrieve_matadata(
                validation_directory=configuration_directory,
                num_runs=num_runs,
                metric=metric,
                cutoff=cutoff,
                configuration_space=configuration_space,
                only_best=args.only_best)

            if len(outputs) == 0:
                raise ValueError("Nothing found!")

            write_output(outputs, configurations, output_dir_,
                         configuration_space, metric)
Code example #12
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), '.test')

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('boston')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D,
                                         configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(
                os.path.exists(
                    os.path.join(output_dir, '.auto-sklearn',
                                 'true_targets_ensemble.npy')))
            break
Code example #13
    def test_metalearning(self):
        dataset_name = 'digits'

        initial_challengers = {
            ACC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            AUC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'",
            BAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            F1_METRIC: "--initial-challengers \" "
                       "-balancing:strategy 'weighting' "
                       "-classifier:__choice__ 'proj_logit'",
            PAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'"
        }

        for metric in initial_challengers:
            configuration_space = get_configuration_space(
                {
                    'metric': metric,
                    'task': MULTICLASS_CLASSIFICATION,
                    'is_sparse': False
                },
                include_preprocessors=['no_preprocessing'])

            X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
            categorical = [False] * X_train.shape[1]

            meta_features_label = calc_meta_features(X_train, Y_train,
                                                     categorical, dataset_name)
            meta_features_encoded_label = calc_meta_features_encoded(X_train,
                                                                     Y_train,
                                                                     categorical,
                                                                     dataset_name)
            initial_configuration_strings_for_smac = \
                create_metalearning_string_for_smac_call(
                    meta_features_label,
                    meta_features_encoded_label,
                    configuration_space, dataset_name, metric,
                    MULTICLASS_CLASSIFICATION, False, 1, None)

            print(metric)
            print(initial_configuration_strings_for_smac[0])
            self.assertTrue(initial_configuration_strings_for_smac[
                                0].startswith(initial_challengers[metric]))
Code example #14
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        class Dummy2(object):

            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': task_type,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
Code example #15
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': AUC_METRIC,
            'task': BINARY_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 2
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Code example #16
File: autosklearn.py Project: zhongbinEDEN/mosaic_ml
def get_autosklearn_metalearning(X_train, y_train, cat, metric,
                                 num_initial_configurations):
    task_id = "new_task"
    is_sparse = scipy.sparse.issparse(X_train)

    dataset_properties = {
        'signed': True,
        'multiclass': False if len(np.unique(y_train)) == 2 else True,
        'task': 1 if len(np.unique(y_train)) == 2 else 2,
        'sparse': is_sparse,
        'is_sparse': is_sparse,
        'target_type': 'classification',
        'multilabel': False
    }

    config_space = pipeline.get_configuration_space(dataset_properties, None,
                                                    None, None, None)

    metalearning_dir = os.path.join(
        os.path.dirname(metalearning.__file__), "files",
        "balanced_accuracy_{0}.classification_{1}".format(
            "multiclass" if dataset_properties["multiclass"] else "binary",
            "sparse" if dataset_properties["sparse"] else "dense"))
    metabase = MetaBase(config_space, metalearning_dir)

    meta_features = None
    try:
        rvals, sparse = perform_one_hot_encoding(
            dataset_properties["sparse"], [c in ['categorical'] for c in cat],
            [X_train])
        meta_features = _calculate_metafeatures_encoded__(
            task_id, rvals[0], y_train)
        X_train = rvals
    except:
        meta_features = _calculate_metafeatures__(cat,
                                                  MULTICLASS_CLASSIFICATION,
                                                  task_id, X_train, y_train)

    if meta_features is None:
        raise Exception("Error calculating metafeatures")

    metabase.add_dataset(task_id, meta_features)

    configs, list_nn = (suggest_via_metalearning_(
        metabase, task_id, metric,
        2 if dataset_properties["multiclass"] else 1, False,
        num_initial_configurations))

    return configs, list_nn
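
A hedged usage sketch for the helper above; the toy data, the column-type list, and the metric name are illustrative placeholders rather than values taken from the project.

# Hypothetical call to get_autosklearn_metalearning as defined above.
import numpy as np

X = np.random.rand(200, 4)              # toy feature matrix
y = np.random.randint(0, 2, size=200)   # toy binary labels
cat = ['numerical'] * X.shape[1]        # per-column types, as checked above
configs, neighbours = get_autosklearn_metalearning(
    X, y, cat, metric='balanced_accuracy',  # metric name is an assumption
    num_initial_configurations=25)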
Code example #17
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        class Dummy2(object):
            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': task_type,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
Code example #18
File: automl.py Project: wanjinchang/auto-sklearn
def _create_search_space(tmp_dir, data_info, backend, watcher, logger,
                         include_estimators=None, include_preprocessors=None):
    task_name = 'CreateConfigSpace'
    watcher.start_task(task_name)
    configspace_path = os.path.join(tmp_dir, 'space.pcs')
    configuration_space = pipeline.get_configuration_space(
        data_info,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
    sp_string = pcs_parser.write(configuration_space)
    backend.write_txt_file(configspace_path, sp_string,
                           'Configuration space')
    watcher.stop_task(task_name)

    return configuration_space, configspace_path
Code example #19
    def setUp(self):
        self.queue = multiprocessing.Queue()
        self.configuration = get_configuration_space({
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False
        }).get_default_configuration()
        self.data = get_multiclass_classification_datamanager()
        self.tmp_dir = os.path.join(os.path.dirname(__file__),
                                    '.test_holdout_functions')
        self.n = len(self.data.data['Y_train'])
        self.y = self.data.data['Y_train'].flatten()
        self.backend = unittest.mock.Mock()
        self.backend.get_model_dir.return_value = 'udiaetzrpduaeirdaetr'
        self.backend.output_directory = 'duapdbaetpdbe'
        self.dataset_name = json.dumps({'task_id': 'test'})
Code example #20
File: automl.py Project: stokasto/auto-sklearn
def _create_search_space(tmp_dir, data_info, backend, watcher, logger,
                         include_estimators=None, include_preprocessors=None):
    task_name = 'CreateConfigSpace'
    watcher.start_task(task_name)
    configspace_path = os.path.join(tmp_dir, 'space.pcs')
    configuration_space = pipeline.get_configuration_space(
        data_info,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
    sp_string = pcs_parser.write(configuration_space)
    backend.write_txt_file(configspace_path, sp_string,
                           'Configuration space')
    watcher.stop_task(task_name)

    return configuration_space, configspace_path
Code example #21
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), '.test')

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('boston')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D, configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(
                output_dir, '.auto-sklearn', 'true_targets_ensemble.npy')))
            break
Code example #22
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': AUC_METRIC,
            'task': BINARY_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 2
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info, include_estimators=['lda'], include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Code example #23
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration,
                                         all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)
Code example #24
    def test_evaluate_multilabel_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        Y_train = np.array(convert_to_bin(Y_train, 3))
        Y_train[:, -1] = 1
        Y_test = np.array(convert_to_bin(Y_test, 3))
        Y_test[:, -1] = 1

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': F1_METRIC,
            'task': MULTILABEL_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
Code example #25
def get_meta_learning_configs(X,
                              y,
                              task_type,
                              dataset_name='default',
                              metric='accuracy',
                              num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
Code example #26
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info, include_estimators=['lda'], include_preprocessors=['pca'])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_,
                                         configuration,
                                         all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)
Code example #27
    def test_evaluate_multilabel_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        Y_train = np.array(convert_to_bin(Y_train, 3))
        Y_train[:, -1] = 1
        Y_test = np.array(convert_to_bin(Y_test, 3))
        Y_test[:, -1] = 1

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': F1_METRIC,
            'task': MULTILABEL_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
Code example #28
    def test_evaluate_regression(self):
        X_train, Y_train, X_test, Y_test = get_dataset('boston')

        X_valid = X_test[:200, ]
        Y_valid = Y_test[:200, ]
        X_test = X_test[200:, ]
        Y_test = Y_test[200:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 1
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = [
            'numerical', 'Numerical', 'numerical', 'numerical', 'numerical',
            'numerical', 'numerical', 'numerical', 'numerical', 'numerical',
            'numerical'
        ]

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Code example #29
    def test_evaluate_regression(self):
        X_train, Y_train, X_test, Y_test = get_dataset('boston')

        X_valid = X_test[:200, ]
        Y_valid = Y_test[:200, ]
        X_test = X_test[200:, ]
        Y_test = Y_test[200:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 1
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical',
                       'numerical', 'numerical', 'numerical', 'numerical',
                       'numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Code example #30
File: automl.py Project: Ayaro/auto-sklearn
    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path
Code example #31
    def test_file_output(self):
        self.output_dir = os.path.join(os.getcwd(), '.test')

        D = get_regression_datamanager()
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        configuration = configuration_space.sample_configuration()
        evaluator = HoldoutEvaluator(D, self.output_dir, configuration,
                                     with_predictions=True,
                                     all_scoring_functions=True,
                                     output_y_test=True)

        loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
            evaluator.fit_predict_and_loss()
        evaluator.file_output(loss, Y_optimization_pred, Y_valid_pred,
                              Y_test_pred)

        self.assertTrue(os.path.exists(os.path.join(
            self.output_dir, '.auto-sklearn', 'true_targets_ensemble.npy')))
Code example #32
    def test_5000_classes(self):
        weights = ([0.0002] * 4750) + ([0.0001] * 250)
        X, Y = sklearn.datasets.make_classification(n_samples=10000,
                                                    n_features=20,
                                                    n_classes=5000,
                                                    n_clusters_per_class=1,
                                                    n_informative=15,
                                                    n_redundant=5,
                                                    n_repeated=0,
                                                    weights=weights,
                                                    flip_y=0,
                                                    class_sep=1.0,
                                                    hypercube=True,
                                                    shift=None,
                                                    scale=1.0,
                                                    shuffle=True,
                                                    random_state=1)

        self.assertEqual(250, np.sum(np.bincount(Y) == 1))
        D = Dummy()
        D.info = {
            'metric': ACC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 1
        }
        D.data = {'X_train': X, 'Y_train': Y, 'X_valid': X, 'X_test': X}
        D.feat_type = ['numerical'] * 5000

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['no_preprocessing'])
        configuration = configuration_space.sample_configuration()
        D_ = copy.deepcopy(D)
        evaluator = HoldoutEvaluator(D_, configuration)
        evaluator.fit()
Code example #34
File: config_space.py Project: amueller/openml-pimp
def get_config_space(classifier):
    if classifier != 'neural_network':
        autosklearn_config_space = get_configuration_space(
            info={'task': autosklearn.constants.MULTICLASS_CLASSIFICATION, 'is_sparse': 0},
            include_estimators=[classifier],
            include_preprocessors=['no_preprocessing'])

        configuration_space = ConfigSpace.ConfigurationSpace()
        for name, hyperparameter in autosklearn_config_space._hyperparameters.items():
            if isinstance(hyperparameter, ConfigSpace.hyperparameters.Constant):
                continue
            if hyperparameter.name.startswith('classifier') or hyperparameter.name.startswith('imputation'):
                configuration_space.add_hyperparameter(hyperparameter)

        if classifier == 'random_forest':
            hyperparameter = configuration_space.get_hyperparameter('classifier:random_forest:max_features')
            hyperparameter.lower = 0.1
            hyperparameter.lower_hard = 0.1
            hyperparameter.upper = 0.9
            hyperparameter.upper_hard = 0.9
            hyperparameter.default = 0.1

        return configuration_space

    config_space = ConfigSpace.ConfigurationSpace()
    config_space.add_hyperparameter(ConfigSpace.CategoricalHyperparameter('imputation:strategy', ['mean', 'median', 'most_frequent']))
    config_space.add_hyperparameter(ConfigSpace.CategoricalHyperparameter('classifier:__choice__', [classifier]))
    config_space.add_hyperparameter(ConfigSpace.UniformIntegerHyperparameter('classifier:neural_network:hidden_layer_sizes', 32, 1024))
    config_space.add_hyperparameter(ConfigSpace.UniformIntegerHyperparameter('classifier:neural_network:num_hidden_layers', 1, 5))
    config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter('classifier:neural_network:learning_rate_init', 0.00001, 1, log=True))
    config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter('classifier:neural_network:alpha', 0.0000001, 0.0001, log=True))
   # config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter('classifier:neural_network:beta_1', 0, 1))
   # config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter('classifier:neural_network:beta_2', 0, 1))
   # config_space.add_hyperparameter(ConfigSpace.UniformIntegerHyperparameter('classifier:neural_network:max_iter', 2, 1000))
    config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter('classifier:neural_network:momentum', 0.1, 0.9))
    return config_space
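
A hedged usage sketch for get_config_space above, assuming the ConfigSpace package is installed and the classifier name matches an auto-sklearn component:

# Hypothetical usage of get_config_space defined above.
space = get_config_space('random_forest')
for hp in space.get_hyperparameters():    # the filtered classifier/imputation hyperparameters
    print(hp.name)
sample = space.sample_configuration()     # draw one configuration to try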
Code example #35
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': ACC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info, include_estimators=['lda'], include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_,
                                          configuration,
                                          with_predictions=True,
                                          all_scoring_functions=True)

            if not self._fit(evaluator):
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_[ACC_METRIC]
            print(err[i], configuration['classifier:__choice__'])
            print(e_['outer:bac_metric'], e_[BAC_METRIC])

            # Test the outer CV
            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that the five outer models were trained
            self.assertEqual(len(evaluator.outer_models), 5)
            self.assertTrue(
                all([model is not None for model in evaluator.outer_models]))

            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.1)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.1)
                num_models_better_than_random += 1

            # Test the inner CV
            self.assertEqual(len(evaluator.inner_models), 5)
            for fold in range(5):
                self.assertEqual(len(evaluator.inner_models[fold]), 5)
                self.assertTrue(
                    all([
                        model is not None
                        for model in evaluator.inner_models[fold]
                    ]))
                self.assertGreaterEqual(len(evaluator.outer_indices[fold][0]),
                                        75)
                for inner_fold in range(5):
                    self.assertGreaterEqual(
                        len(evaluator.inner_indices[fold][inner_fold][0]), 60)

        self.assertGreater(num_models_better_than_random, 9)
Code example #36
def main():
    parser = ArgumentParser()

    parser.add_argument("configuration_directory",
                        metavar="configuration-directory")
    parser.add_argument("output_directory", metavar="output-directory")
    parser.add_argument("--cutoff",
                        type=int,
                        default=-1,
                        help="Only consider the validation performances up to "
                        "this time.")
    parser.add_argument("--num-runs", type=int, default=1)
    parser.add_argument("--only-best",
                        type=bool,
                        default=False,
                        help="Look only for the best configuration in the "
                        "validation files.")

    args = parser.parse_args()
    configuration_directory = args.configuration_directory
    output_dir = args.output_directory
    cutoff = int(args.cutoff)
    num_runs = args.num_runs

    for sparse, task in [(1, BINARY_CLASSIFICATION),
                         (1, MULTICLASS_CLASSIFICATION),
                         (0, BINARY_CLASSIFICATION),
                         (0, MULTICLASS_CLASSIFICATION)]:

        for metric in [
                'acc_metric', 'auc_metric', 'bac_metric', 'f1_metric',
                'pac_metric'
        ]:

            output_dir_ = os.path.join(
                output_dir, '%s_%s_%s' % (metric, TASK_TYPES_TO_STRING[task],
                                          'sparse' if sparse else 'dense'))

            configuration_space = pipeline.get_configuration_space(
                {'is_sparse': sparse, 'task': task})

            try:
                os.makedirs(output_dir_)
            except:
                pass

            outputs, configurations = retrieve_matadata(
                validation_directory=configuration_directory,
                num_runs=num_runs,
                metric=metric,
                cutoff=cutoff,
                configuration_space=configuration_space,
                only_best=args.only_best)

            if len(outputs) == 0:
                raise ValueError("Nothing found!")

            write_output(outputs, configurations, output_dir_,
                         configuration_space, metric)
Code example #37
    def test_evaluate_multiclass_classification_partial_fit(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration, with_predictions=True)

            if not self._partial_fit(evaluator, fold=i % 10):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_
            print(err[i], configuration['classifier:__choice__'])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that only one model was trained
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(
                1,
                np.sum([
                    True if model is not None else False
                    for model in evaluator.models
                ]))
            self.assertLess(Y_optimization_pred.shape[0], 13)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            # Test some basic statistics of the dataset
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreaterEqual(num_models_better_than_random, 5)
Code example #38
    def test_metalearning(self):
        dataset_name_classification = 'digits'
        initial_challengers_classification = {
            ACC_METRIC:
            "--initial-challengers \" "
            "-balancing:strategy 'weighting' "
            "-classifier:__choice__ 'proj_logit'",
            AUC_METRIC:
            "--initial-challengers \" "
            "-balancing:strategy 'weighting' "
            "-classifier:__choice__ 'liblinear_svc'",
            BAC_METRIC:
            "--initial-challengers \" "
            "-balancing:strategy 'weighting' "
            "-classifier:__choice__ 'proj_logit'",
            F1_METRIC:
            "--initial-challengers \" "
            "-balancing:strategy 'weighting' "
            "-classifier:__choice__ 'proj_logit'",
            PAC_METRIC:
            "--initial-challengers \" "
            "-balancing:strategy 'none' "
            "-classifier:__choice__ 'random_forest'"
        }

        dataset_name_regression = 'diabetes'
        initial_challengers_regression = {
            A_METRIC:
            "--initial-challengers \" "
            "-imputation:strategy 'mean' "
            "-one_hot_encoding:minimum_fraction '0.01' "
            "-one_hot_encoding:use_minimum_fraction 'True' "
            "-preprocessor:__choice__ 'no_preprocessing' "
            "-regressor:__choice__ 'random_forest'",
            R2_METRIC:
            "--initial-challengers \" "
            "-imputation:strategy 'mean' "
            "-one_hot_encoding:minimum_fraction '0.01' "
            "-one_hot_encoding:use_minimum_fraction 'True' "
            "-preprocessor:__choice__ 'no_preprocessing' "
            "-regressor:__choice__ 'random_forest'",
        }

        for dataset_name, task, initial_challengers in [
            (dataset_name_regression, REGRESSION,
             initial_challengers_regression),
            (dataset_name_classification, MULTICLASS_CLASSIFICATION,
             initial_challengers_classification)
        ]:
            for metric in initial_challengers:
                configuration_space = get_configuration_space(
                    {
                        'metric': metric,
                        'task': task,
                        'is_sparse': False
                    },
                    include_preprocessors=['no_preprocessing'])

                X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
                categorical = [False] * X_train.shape[1]

                meta_features_label = calc_meta_features(
                    X_train, Y_train, categorical, dataset_name, task)
                meta_features_encoded_label = calc_meta_features_encoded(
                    X_train, Y_train, categorical, dataset_name, task)

                initial_configuration_strings_for_smac = \
                    suggest_via_metalearning(
                        meta_features_label,
                        meta_features_encoded_label,
                        configuration_space, dataset_name, metric,
                        task, False, 1, None)

                print(METRIC_TO_STRING[metric])
                print(initial_configuration_strings_for_smac[0])
                self.assertTrue(
                    initial_configuration_strings_for_smac[0].startswith(
                        initial_challengers[metric]))
Code example #39
    def test_metalearning(self):
        dataset_name_classification = 'digits'
        initial_challengers_classification = {
            ACC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            AUC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'liblinear_svc'",
            BAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            F1_METRIC: "--initial-challengers \" "
                       "-balancing:strategy 'weighting' "
                       "-classifier:__choice__ 'proj_logit'",
            PAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'"
        }

        dataset_name_regression = 'diabetes'
        initial_challengers_regression = {
            A_METRIC: "--initial-challengers \" "
                      "-imputation:strategy 'mean' "
                      "-one_hot_encoding:minimum_fraction '0.01' "
                      "-one_hot_encoding:use_minimum_fraction 'True' "
                      "-preprocessor:__choice__ 'no_preprocessing' "
                      "-regressor:__choice__ 'random_forest'",
            R2_METRIC: "--initial-challengers \" "
                       "-imputation:strategy 'mean' "
                       "-one_hot_encoding:minimum_fraction '0.01' "
                       "-one_hot_encoding:use_minimum_fraction 'True' "
                       "-preprocessor:__choice__ 'no_preprocessing' "
                       "-regressor:__choice__ 'random_forest'",
        }

        for dataset_name, task, initial_challengers in [
            (dataset_name_regression, REGRESSION,
             initial_challengers_regression),
            (dataset_name_classification, MULTICLASS_CLASSIFICATION,
             initial_challengers_classification)
        ]:
            for metric in initial_challengers:
                configuration_space = get_configuration_space(
                    {
                        'metric': metric,
                        'task': task,
                        'is_sparse': False
                    },
                    include_preprocessors=['no_preprocessing'])

                X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
                categorical = [False] * X_train.shape[1]

                meta_features_label = calc_meta_features(
                    X_train, Y_train, categorical, dataset_name, task)
                meta_features_encoded_label = calc_meta_features_encoded(
                    X_train, Y_train, categorical, dataset_name, task)

                initial_configuration_strings_for_smac = \
                    suggest_via_metalearning(
                        meta_features_label,
                        meta_features_encoded_label,
                        configuration_space, dataset_name, metric,
                        task, False, 1, None)

                print(metric)
                print(initial_configuration_strings_for_smac[0])
                self.assertTrue(
                    initial_configuration_strings_for_smac[0].startswith(
                        initial_challengers[metric]))
Code example #40
    def test_evaluate_multiclass_classification_partial_fit(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration, with_predictions=True)

            if not self._partial_fit(evaluator, fold=i % 10):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_
            print(err[i], configuration['classifier:__choice__'])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that only one of the ten fold models was trained
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(1, np.sum([True if model is not None else False
                                        for model in evaluator.models]))
            self.assertLess(Y_optimization_pred.shape[0], 13)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreaterEqual(num_models_better_than_random, 5)
Code example #41
        
        # ======== Creating a data object with the data and information about it
        logger.info("========= Reading and converting data ==========")
        D = DataManager(basename, data_dir, max_samples=max_samples)
        logger.info(str(D))
        logger.info("[+] Size of uploaded data  %5.2f bytes" %
               data_io.total_size(D))
        overall_time_budget = min(max_time, D.info['time_budget'])
        
        # ======== Create auto-sklearn model
        new_info_object = {}
        new_info_object['is_sparse'] = D.info['is_sparse']
        new_info_object['task'] = STRING_TO_TASK_TYPES[D.info['task']]
        new_info_object['metric'] = STRING_TO_METRIC[D.info['metric']]

        configuration_space = get_configuration_space(new_info_object)
        try:
            config = ConfigSpace.Configuration(configuration_space, configuration)
        except Exception as inst:
            execution_success = False
            logger.critical(inst)
            continue

        logger.info("Running the following configuration:")
        logger.info(str(config))

        if 'classifier:__choice__' in configuration:
            M = SimpleClassificationPipeline(config, 1)
        elif 'regressor:__choice__' in configuration:
            M = SimpleRegressionPipeline(config, 1)
        else:
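The excerpt above breaks off inside its final else branch, just after the pipeline object M has been built from the sampled configuration. A rough sketch of the step that usually follows, assuming the pipeline objects follow the scikit-learn fit/predict convention and that D.data exposes the same split keys used by the other examples in this listing:

        # Illustrative continuation, not taken from the original script.
        M.fit(D.data['X_train'], D.data['Y_train'])
        Y_valid_pred = M.predict(D.data['X_valid'])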
Code example #42
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': ACC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_, configuration,
                                          with_predictions=True,
                                          all_scoring_functions=True)

            if not self._fit(evaluator):
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_[ACC_METRIC]
            print(err[i], configuration['classifier:__choice__'])
            print(e_['outer:bac_metric'], e_[BAC_METRIC])

            # Test the outer CV
            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that all five outer models were trained
            self.assertEqual(len(evaluator.outer_models), 5)
            self.assertTrue(all([model is not None
                                 for model in evaluator.outer_models]))

            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.1)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.1)
                num_models_better_than_random += 1

            # Test the inner CV
            self.assertEqual(len(evaluator.inner_models), 5)
            for fold in range(5):
                self.assertEqual(len(evaluator.inner_models[fold]), 5)
                self.assertTrue(all([model is not None
                                     for model in evaluator.inner_models[fold]
                                     ]))
                self.assertGreaterEqual(len(evaluator.outer_indices[fold][0]),
                                        75)
                for inner_fold in range(5):
                    self.assertGreaterEqual(
                        len(evaluator.inner_indices[fold][inner_fold][0]), 60)

        self.assertGreater(num_models_better_than_random, 9)
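Like the partial-fit variant sketched after code example #37, this test depends on a _fit helper defined elsewhere in the test class. A sketch under the same caveats, this time fitting all folds through the evaluator's fit() method:

    def _fit(self, evaluator):
        # Hypothetical counterpart to _partial_fit: fit every fold and report
        # whether the configuration could be trained (assumes evaluator.fit()).
        try:
            evaluator.fit()
            return True
        except Exception as e:
            print(e)
            return False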
Code example #43
def main(dataset_info, mode, seed, params,
         mode_args=None, output_dir=None):
    """This command line interface has three different operation modes:

    * CV: useful for the Tweakathon
    * 1/3 test split: useful to evaluate a configuration
    * cv on 2/3 train split: useful to optimize hyperparameters in a training
      mode before testing a configuration on the 1/3 test split.

    It must by no means be used for the Auto part of the competition!
    """
    if mode_args is None:
        mode_args = {}

    if output_dir is None:
        output_dir = os.getcwd()

    if not isinstance(dataset_info, AbstractDataManager):
        D = store_and_or_load_data(dataset_info=dataset_info,
                                   outputdir=output_dir)
    else:
        D = dataset_info
    metric = D.info['metric']

    num_run = None
    if mode != 'test':
        num_run = get_new_run_num()

    if params is not None:
        for key in params:
            try:
                params[key] = int(params[key])
            except Exception:
                try:
                    params[key] = float(params[key])
                except Exception:
                    pass

        cs = get_configuration_space(D.info)
        configuration = configuration_space.Configuration(cs, params)
    else:
        configuration = None

    if seed is not None:
        seed = int(float(seed))
    else:
        seed = 1

    global evaluator

    if mode == 'holdout':
        make_mode_holdout(D, seed, configuration, num_run, output_dir)
    elif mode == 'holdout-iterative-fit':
        make_mode_holdout_iterative_fit(D, seed, configuration, num_run,
                                        output_dir)
    elif mode == 'test':
        make_mode_test(D, seed, configuration, metric, output_dir)
    elif mode == 'cv':
        make_mode_cv(D, seed, configuration, num_run, mode_args['folds'],
                     output_dir)
    elif mode == 'partial-cv':
        make_mode_partial_cv(D, seed, configuration, num_run,
                             metric, mode_args['fold'], mode_args['folds'],
                             output_dir)
    elif mode == 'nested-cv':
        make_mode_nested_cv(D, seed, configuration, num_run,
                            mode_args['inner_folds'], mode_args['outer_folds'],
                            output_dir)
    else:
        raise ValueError('Must choose a legal mode.')
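The dispatch at the end of main shows which mode strings and mode_args keys this interface accepts. Hypothetical invocations might look as follows; the dataset path, fold counts and output directory are placeholders rather than values taken from the original code.

# Placeholder paths and fold counts, for illustration only.
main('path/to/dataset', mode='holdout', seed=1, params=None,
     output_dir='/tmp/autosklearn-run')
main('path/to/dataset', mode='cv', seed=1, params=None,
     mode_args={'folds': 5}, output_dir='/tmp/autosklearn-run')
main('path/to/dataset', mode='nested-cv', seed=1, params=None,
     mode_args={'inner_folds': 3, 'outer_folds': 3},
     output_dir='/tmp/autosklearn-run')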
Code example #44
def main(dataset_info, mode, seed, params, mode_args=None):
    """This command line interface has three different operation modes:

    * CV: useful for the Tweakathon
    * 1/3 test split: useful to evaluate a configuration
    * cv on 2/3 train split: useful to optimize hyperparameters in a training
      mode before testing a configuration on the 1/3 test split.

    It must by no means be used for the Auto part of the competition!
    """
    if mode_args is None:
        mode_args = {}

    output_dir = os.getcwd()

    if not isinstance(dataset_info, AbstractDataManager):
        D = store_and_or_load_data(dataset_info=dataset_info,
                                   outputdir=output_dir)
    else:
        D = dataset_info
    metric = D.info['metric']

    num_run = None
    if mode != 'test':
        num_run = get_new_run_num()

    if params is not None:
        for key in params:
            try:
                params[key] = int(params[key])
            except Exception:
                try:
                    params[key] = float(params[key])
                except Exception:
                    pass

        cs = get_configuration_space(D.info)
        configuration = configuration_space.Configuration(cs, params)
    else:
        configuration = None

    if seed is not None:
        seed = int(float(seed))
    else:
        seed = 1

    global evaluator

    if mode == 'holdout':
        make_mode_holdout(D, seed, configuration, num_run)
    elif mode == 'holdout-iterative-fit':
        make_mode_holdout_iterative_fit(D, seed, configuration, num_run)
    elif mode == 'test':
        make_mode_test(D, seed, configuration, metric)
    elif mode == 'cv':
        make_mode_cv(D, seed, configuration, num_run, mode_args['folds'])
    elif mode == 'partial-cv':
        make_mode_partial_cv(D, seed, configuration, num_run, metric,
                             mode_args['fold'], mode_args['folds'])
    elif mode == 'nested-cv':
        make_mode_nested_cv(D, seed, configuration, num_run,
                            mode_args['inner_folds'], mode_args['outer_folds'])
    else:
        raise ValueError('Must choose a legal mode.')