Example 1
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['densifier']},
            dataset_properties={'sparse': True})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                         'qda')

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['nystroem_sampler']})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                         'sgd')
Example 2
 def test_default_configuration(self):
     for i in range(2):
         cs = ParamSklearnClassifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = ParamSklearnClassifier(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example 3
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
            self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['densifier']},
            dataset_properties={'sparse': True})
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'qda')

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['nystroem_sampler']})
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'sgd')
Example 4
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))
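        # RLIMIT_AS caps the process address space, so oversized
        # configurations fail fast with a MemoryError instead of
        # exhausting the machine.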

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = ParamSklearnClassifier(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                        "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
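Note that the except RuntimeWarning and except UserWarning branches above can only fire if warnings have been escalated to exceptions. The escalation is not part of this excerpt; a minimal sketch of the kind of filter the surrounding test harness would need (an assumption, not shown in the source):

import warnings

# Turn warnings into exceptions so that RuntimeWarning and UserWarning
# propagate into the except branches of the tests above.
warnings.filterwarnings('error')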
Example 5
def _get_classification_configuration_space(info, include):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties,
        include=include)
Example 6
def _get_classification_configuration_space(info,
                                            include_estimators=None,
                                            include_preprocessors=None):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
        pass
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
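The two definitions above are successive revisions of the same helper: the first takes a single include dictionary, the second the older separate include_estimators/include_preprocessors arguments. A minimal usage sketch of the newer signature, assuming MULTICLASS_CLASSIFICATION comes from the same constants module the helper already references:

# Hypothetical call: a dense multiclass task restricted to random forests.
info = {'task': MULTICLASS_CLASSIFICATION, 'is_sparse': 0}
cs = _get_classification_configuration_space(
    info, include={'classifier': ['random_forest']})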
Example 7
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = ParamSklearnClassifier(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 8
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'classifier': ['libsvm_svc']})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__'),
            CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            exclude={'classifier': ['libsvm_svc']})
        self.assertNotIn('libsvm_svc', str(cs))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['select_percentile_classification']})
        self.assertEqual(cs.get_hyperparameter('preprocessor:__choice__'),
            CategoricalHyperparameter('preprocessor:__choice__',
                                      ['select_percentile_classification']))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            exclude={'preprocessor': ['select_percentile_classification']})
        self.assertNotIn('select_percentile_classification', str(cs))
Example 9
    def test_get_hyperparameter_search_space_dataset_properties(self):
        cs_mc = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'multiclass': True})
        self.assertNotIn('bernoulli_nb', str(cs_mc))

        cs_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True})
        self.assertNotIn('k_nearest_neighbors', str(cs_ml))
        self.assertNotIn('liblinear', str(cs_ml))
        self.assertNotIn('libsvm_svc', str(cs_ml))
        self.assertNotIn('sgd', str(cs_ml))

        cs_sp = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        self.assertIn('extra_trees', str(cs_sp))
        self.assertIn('gradient_boosting', str(cs_sp))
        self.assertIn('random_forest', str(cs_sp))

        cs_mc_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True, 'multiclass': True})
        self.assertEqual(cs_ml, cs_mc_ml)
Example 10
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'classifier': ['libsvm_svc']})
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__'),
            CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            exclude={'classifier': ['libsvm_svc']})
        self.assertNotIn('libsvm_svc', str(cs))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            include={'preprocessor': ['select_percentile_classification']})
        self.assertEqual(
            cs.get_hyperparameter('preprocessor:__choice__'),
            CategoricalHyperparameter('preprocessor:__choice__',
                                      ['select_percentile_classification']))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            exclude={'preprocessor': ['select_percentile_classification']})
        self.assertNotIn('select_percentile_classification', str(cs))
Example 11
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = ParamSklearnClassifier(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 12
    def test_get_hyperparameter_search_space_dataset_properties(self):
        cs_mc = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'multiclass': True})
        self.assertNotIn('bernoulli_nb', str(cs_mc))

        cs_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True})
        self.assertNotIn('k_nearest_neighbors', str(cs_ml))
        self.assertNotIn('liblinear', str(cs_ml))
        self.assertNotIn('libsvm_svc', str(cs_ml))
        self.assertNotIn('sgd', str(cs_ml))

        cs_sp = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        self.assertIn('extra_trees', str(cs_sp))
        self.assertIn('gradient_boosting', str(cs_sp))
        self.assertIn('random_forest', str(cs_sp))

        cs_mc_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={
                'multilabel': True,
                'multiclass': True
            })
        self.assertEqual(cs_ml, cs_mc_ml)
Example 13
 def test_default_configuration(self):
     for i in range(2):
         cs = ParamSklearnClassifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = ParamSklearnClassifier(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(
             0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example 14
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = ParamSklearnClassifier(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                        "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example 15
    def test_get_hyperparameter_search_space(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()

        self.assertEqual(len(cs.get_hyperparameter(
            'rescaling:__choice__').choices), 4)
        self.assertEqual(len(cs.get_hyperparameter(
            'classifier:__choice__').choices), 16)
        self.assertEqual(len(cs.get_hyperparameter(
            'preprocessor:__choice__').choices), 14)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(145, len(hyperparameters))

        # for hp in sorted([str(h) for h in hyperparameters]):
        #     print(hp)

        # Six hyperparameters are always active: the classifier,
        # preprocessor, rescaling and balancing choices plus the
        # imputation and one-hot-encoding strategies
        self.assertEqual(len(hyperparameters) - 6, len(conditions))
Example 16
    def test_get_hyperparameter_search_space(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()

        self.assertEqual(
            len(cs.get_hyperparameter('rescaling:__choice__').choices), 4)
        self.assertEqual(
            len(cs.get_hyperparameter('classifier:__choice__').choices), 16)
        self.assertEqual(
            len(cs.get_hyperparameter('preprocessor:__choice__').choices), 14)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(145, len(hyperparameters))

        # for hp in sorted([str(h) for h in hyperparameters]):
        #     print(hp)

        # Six hyperparameters are always active: the classifier,
        # preprocessor, rescaling and balancing choices plus the
        # imputation and one-hot-encoding strategies
        self.assertEqual(len(hyperparameters) - 6, len(conditions))
Example 17
    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            categorical = [True, True, True, False, False, True, True, True,
                           False, True, True, True, True, True, True, True,
                           True, True, True, True, True, True, True, True, True,
                           True, True, True, True, True, True, True, False,
                           False, False, True, True, True]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(os.path.join(this_directory, "components",
                                        "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = \
                sklearn.cross_validation.train_test_split(X, y)

            cls = ParamSklearnClassifier(config, random_state=1)
            try:
                cls.fit(X_train, Y_train,
                        init_params={'one_hot_encoding:categorical_features': categorical})
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
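The categorical mask above must contain one boolean per feature column that remains after the label is split off into y (38 entries here). A sanity check one could add inside the loop, assuming X has already been sliced as in the test:

# One mask entry per remaining feature column.
assert len(categorical) == X.shape[1]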
Example 18
    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            categorical = [
                True, True, True, False, False, True, True, True, False, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, False, False, False, True, True, True
            ]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(
                os.path.join(this_directory, "components",
                             "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = \
                sklearn.cross_validation.train_test_split(X, y)

            cls = ParamSklearnClassifier(
                config,
                random_state=1,
            )
            try:
                cls.fit(X_train,
                        Y_train,
                        init_params={
                            'one_hot_encoding:categorical_features':
                            categorical
                        })
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example 19
    def test_weighting_effect(self):
        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.692, 0.719),
                 ('decision_tree', DecisionTree, 0.712, 0.668),
                 ('extra_trees', ExtraTreesClassifier, 0.901, 0.919),
                 ('gradient_boosting', GradientBoostingClassifier, 0.879, 0.883),
                 ('random_forest', RandomForest, 0.886, 0.885),
                 ('libsvm_svc', LibSVM_SVC, 0.915, 0.937),
                 ('liblinear_svc', LibLinear_SVC, 0.920, 0.923),
                 ('sgd', SGD, 0.811, 0.902)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                  ExtraTreesPreprocessor, 0.892, 0.910),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                  0.906, 0.909)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:

                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={
                        'classifier': ['sgd'],
                        'preprocessor': [name]
                    })
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={
                        'classifier': ['sgd'],
                        'preprocessor': [name]
                    })
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)
Example 20
D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
X = D.data['X_train']
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(1.0, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'sgd',
        'classifier:sgd:loss': 'hinge',
        'classifier:sgd:penalty': 'l2',
        'classifier:sgd:alpha': 0.0001,
        'classifier:sgd:fit_intercept': True,
        'classifier:sgd:n_iter': 5,
        'classifier:sgd:learning_rate': 'optimal',
        'classifier:sgd:eta0': 0.01,
        'classifier:sgd:average': True,
        'imputation:strategy': 'mean',
        'one_hot_encoding:use_minimum_fraction': 'True',
        'one_hot_encoding:minimum_fraction': 0.1,
        'preprocessor:__choice__': 'no_preprocessing',
        'rescaling:__choice__': 'min/max'}))]

classifiers = []
targets = []
predictions = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
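The snippet breaks off just before the combination step. A sketch of the loop that would follow, mirroring the pattern of Example 34 later in this section: each ensemble member is fit and its class probabilities are scaled by its weight.

for weight, classifier in choices:
    classifiers.append(classifier)
    classifier.fit(X.copy(), y.copy())
    predictions_valid.append(classifier.predict_proba(X_valid.copy()) * weight)
    predictions_test.append(classifier.predict_proba(X_test.copy()) * weight)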
Example 21
    def test_predict_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = ParamSklearnClassifier(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
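The expected call count follows from the batching arithmetic: 1647 test rows in batches of 20 give ceil(1647 / 20) = 83 calls to the wrapped predict. (The predict_proba tests later in this section assert 84 calls, presumably one extra call made up front to determine the output shape.) A worked check:

import math

n_samples, batch_size = 1647, 20
assert math.ceil(n_samples / batch_size) == 83  # matches the assertion above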
Example 22
    def test_multilabel(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        dataset_properties = {'multilabel': True}
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(dataset_properties=dataset_properties)

        print(cs)
        cs.seed(5)

        for i in range(50):
            X, Y = sklearn.datasets.\
                    make_multilabel_classification(n_samples=150,
                                                   n_features=20,
                                                   n_classes=5,
                                                   n_labels=2,
                                                   length=50,
                                                   allow_unlabeled=True,
                                                   sparse=False,
                                                   return_indicator=True,
                                                   return_distributions=False,
                                                   random_state=1)
            X_train = X[:100, :]
            Y_train = Y[:100, :]
            X_test = X[101:, :]
            Y_test = Y[101:, ]

            config = cs.sample_configuration()
            config._populate_values()

            if 'classifier:passive_aggressive:n_iter' in config:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config:
                config._values['classifier:sgd:n_iter'] = 5

            cls = ParamSklearnClassifier(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                for i in predicted_probabilities:
                    self.assertIsInstance(i, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0] or \
                        "removed all features" in e.args[0] or \
                        "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 23
    def test_predict_proba_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 24
    def test_predict_proba_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 25
    def test_predict_proba_batched_sparse(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"
            })

        # Multiclass
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
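Note the string values 'True' and 'None' in the hand-built Configuration above: these hyperparameters are categorical in the search space, so their values are the category strings rather than Python literals. A quick illustration, assuming the usual ConfigSpace behaviour:

# Categorical values come back as the category strings, not bool/None.
value = config['classifier:random_forest:bootstrap']
assert value == 'True' and isinstance(value, str)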
Example 26
 def test_repr(self):
     cs = ParamSklearnClassifier.get_hyperparameter_search_space()
     default = cs.get_default_configuration()
     representation = repr(ParamSklearnClassifier(default))
     cls = eval(representation)
     self.assertIsInstance(cls, ParamSklearnClassifier)
Example 27
    def test_multilabel(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        dataset_properties = {'multilabel': True}
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties=dataset_properties)

        print(cs)
        cs.seed(5)

        for i in range(50):
            X, Y = sklearn.datasets.\
                    make_multilabel_classification(n_samples=150,
                                                   n_features=20,
                                                   n_classes=5,
                                                   n_labels=2,
                                                   length=50,
                                                   allow_unlabeled=True,
                                                   sparse=False,
                                                   return_indicator=True,
                                                   return_distributions=False,
                                                   random_state=1)
            X_train = X[:100, :]
            Y_train = Y[:100, :]
            X_test = X[101:, :]
            Y_test = Y[101:, ]

            config = cs.sample_configuration()
            config._populate_values()

            if 'classifier:passive_aggressive:n_iter' in config:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config:
                config._values['classifier:sgd:n_iter'] = 5

            cls = ParamSklearnClassifier(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                for i in predicted_probabilities:
                    self.assertIsInstance(i, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0] or \
                        "removed all features" in e.args[0] or \
                        "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 28
    def test_predict_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = ParamSklearnClassifier(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 29
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.480000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'random_forest',
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'entropy',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:max_features': 4.885151102990943,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        'classifier:random_forest:n_estimators': 100,
        'imputation:strategy': 'median',
        'one_hot_encoding:minimum_fraction': 0.059297498551361,
        'one_hot_encoding:use_minimum_fraction': 'True',
        'preprocessor:__choice__': 'gem',
        'preprocessor:gem:N': 13,
        'preprocessor:gem:precond': 0.31299029323203487,
        'rescaling:__choice__': 'min/max'})),
     (0.300000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'none',
            'classifier:__choice__': 'random_forest',
            'classifier:random_forest:bootstrap': 'False',
            'classifier:random_forest:criterion': 'entropy',
Example 30
    def test_predict_proba_batched_sparse(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(cs,
                               values={"balancing:strategy": "none",
                                       "classifier:__choice__": "random_forest",
                                       "imputation:strategy": "mean",
                                       "one_hot_encoding:minimum_fraction": 0.01,
                                       "one_hot_encoding:use_minimum_fraction": 'True',
                                       "preprocessor:__choice__": "no_preprocessing",
                                       'classifier:random_forest:bootstrap': 'True',
                                       'classifier:random_forest:criterion': 'gini',
                                       'classifier:random_forest:max_depth': 'None',
                                       'classifier:random_forest:min_samples_split': 2,
                                       'classifier:random_forest:min_samples_leaf': 2,
                                       'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                                       'classifier:random_forest:max_features': 0.5,
                                       'classifier:random_forest:max_leaf_nodes': 'None',
                                       'classifier:random_forest:n_estimators': 100,
                                       "rescaling:__choice__": "min/max"})

        # Multiclass
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 31
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.580000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'weighting',
            'classifier:__choice__': 'extra_trees',
            'classifier:extra_trees:bootstrap': 'True',
            'classifier:extra_trees:criterion': 'gini',
            'classifier:extra_trees:max_depth': 'None',
            'classifier:extra_trees:max_features': 1.4927328322706173,
            'classifier:extra_trees:min_samples_leaf': 1,
            'classifier:extra_trees:min_samples_split': 5,
            'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
            'classifier:extra_trees:n_estimators': 100,
            'imputation:strategy': 'mean',
            'one_hot_encoding:use_minimum_fraction': 'False',
            'preprocessor:__choice__': 'select_rates',
            'preprocessor:select_rates:alpha': 0.4308279694614349,
            'preprocessor:select_rates:mode': 'fwe',
            'preprocessor:select_rates:score_func': 'f_classif',
            'rescaling:__choice__': 'min/max'})),
     (0.200000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'none',
            'classifier:__choice__': 'sgd',
            'classifier:sgd:alpha': 5.707045187542232e-06,
            'classifier:sgd:average': 'True',
Example 32
 def test_repr(self):
     cs = ParamSklearnClassifier.get_hyperparameter_search_space()
     default = cs.get_default_configuration()
     representation = repr(ParamSklearnClassifier(default))
     cls = eval(representation)
     self.assertIsInstance(cls, ParamSklearnClassifier)
Example 33
from ParamSklearn.classification import ParamSklearnClassifier
from HPOlibConfigSpace.random_sampler import RandomSampler
import sklearn.datasets
import sklearn.metrics
import numpy as np

iris = sklearn.datasets.load_iris()
X = iris.data
Y = iris.target
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
configuration_space = ParamSklearnClassifier.get_hyperparameter_search_space()
sampler = RandomSampler(configuration_space, 1)
for i in range(10000):
    configuration = sampler.sample_configuration()
    auto = ParamSklearnClassifier(configuration)
    try:
        auto = auto.fit(X[indices[:100]], Y[indices[:100]])
    except Exception as e:
        print(configuration)
        print(e)
        continue
    predictions = auto.predict(X[indices[100:]])
    print(sklearn.metrics.accuracy_score(predictions, Y[indices[100:]]))
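Example 33 samples through the older HPOlibConfigSpace RandomSampler; the test examples above instead sample directly from the configuration space. A minimal sketch of the direct form, using only calls that appear elsewhere in this section:

# Direct sampling, equivalent to the RandomSampler loop above.
configuration_space = ParamSklearnClassifier.get_hyperparameter_search_space()
config = configuration_space.sample_configuration()
auto = ParamSklearnClassifier(config)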
Example 34
classifiers = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
for weight, configuration in zip(weights, configurations):
    for param in configuration:
        try:
            configuration[param] = int(configuration[param])
        except Exception:
            try:
                configuration[param] = float(configuration[param])
            except Exception:
                pass

    classifier = ParamSklearnClassifier(configuration, 1)
    classifiers.append(classifier)
    try:
        classifier.fit(X.copy(), y.copy())
        predictions_valid.append(
            classifier.predict_proba(X_valid.copy()) * weight)
        predictions_test.append(
            classifier.predict_proba(X_test.copy()) * weight)
    except Exception as e:
        print(e)
        print(configuration)

# Output the predictions
for name, predictions in [('valid', predictions_valid),
                          ('test', predictions_test)]:
    predictions = np.array(predictions)
Example 35
    def test_weighting_effect(self):
        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.692, 0.719),
                 ('decision_tree', DecisionTree, 0.712, 0.668),
                 ('extra_trees', ExtraTreesClassifier, 0.901, 0.919),
                 ('gradient_boosting', GradientBoostingClassifier, 0.879, 0.883),
                 ('random_forest', RandomForest, 0.886, 0.885),
                 ('libsvm_svc', LibSVM_SVC, 0.915, 0.937),
                 ('liblinear_svc', LibLinear_SVC, 0.920, 0.923),
                 ('sgd', SGD, 0.811, 0.902)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                    sklearn.metrics.accuracy_score(predictions, Y_test),
                    places=3)

                # pre_transform and fit_estimator
                X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                  ExtraTreesPreprocessor, 0.892, 0.910),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                  0.906, 0.909)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:

                X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
                cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = ParamSklearnClassifier(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.accuracy_score(
                                           predictions, Y_test),
                                       places=3)
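The test above asserts that switching balancing:strategy from 'none' to 'weighting' changes accuracy by a reproducible amount. Below is a minimal sketch of the usual form of class balancing, inverse-frequency sample weights; this is an assumption about what the strategy amounts to, not code taken from ParamSklearn's Balancing component:

# Sketch: inverse-frequency sample weights (assumed behaviour of the
# 'weighting' strategy; each class contributes equal total weight).
import numpy as np

def balanced_sample_weights(y):
    classes, counts = np.unique(y, return_counts=True)
    weight_per_class = len(y) / (len(classes) * counts.astype(float))
    mapping = dict(zip(classes, weight_per_class))
    return np.array([mapping[label] for label in y])

print(balanced_sample_weights(np.array([0, 0, 0, 1])))
# -> approx. [0.667 0.667 0.667 2.0]: the minority class is upweighted 3x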
Example n. 36
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.140000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'none',
            'classifier:__choice__': 'random_forest',
            'classifier:random_forest:bootstrap': 'False',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:max_features': 4.649151092701434,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:min_samples_leaf': 3,
            'classifier:random_forest:min_samples_split': 5,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            'classifier:random_forest:n_estimators': 100,
            'imputation:strategy': 'most_frequent',
            'one_hot_encoding:minimum_fraction': 0.006861808529548735,
            'one_hot_encoding:use_minimum_fraction': 'True',
            'preprocessor:__choice__': 'select_rates',
            'preprocessor:select_rates:alpha': 0.03408255008474342,
            'preprocessor:select_rates:mode': 'fwe',
            'preprocessor:select_rates:score_func': 'f_classif',
            'rescaling:__choice__': 'normalize'})),
     (0.100000, ParamSklearnClassifier(
         configuration={
             'balancing:strategy': 'weighting',
             'classifier:__choice__': 'random_forest',
             'classifier:random_forest:bootstrap': 'False',
Example n. 37
classifiers = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
for weight, configuration in zip(weights, configurations):
    for param in configuration:
        try:
            configuration[param] = int(configuration[param])
        except Exception:
            try:
                configuration[param] = float(configuration[param])
            except Exception:
                pass

    classifier = ParamSklearnClassifier(configuration, 1)
    classifiers.append(classifier)
    try:
        classifier.fit(X.copy(), y.copy())
        predictions_valid.append(classifier.predict_proba(X_valid.copy()) * weight)
        predictions_test.append(classifier.predict_proba(X_test.copy()) * weight)
    except Exception as e:
        print(e)
        print(configuration)

# Output the predictions
for name, predictions in [('valid', predictions_valid),
                          ('test', predictions_test)]:
    predictions = np.array(predictions)
    predictions = np.sum(predictions, axis=0)
    predictions = predictions[:, 1].reshape((-1, 1))
Example n. 38
def get_model(configuration, seed):
    if 'classifier' in configuration:
        return ParamSklearnClassifier(configuration, seed)
    elif 'regressor' in configuration:
        return ParamSklearnRegressor(configuration, seed)
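A usage sketch for get_model: it dispatches to the classification or regression pipeline depending on the configuration's hyperparameter names. The sketch assumes, as the snippet implies, that a configuration drawn from the classification search space answers True to 'classifier' in configuration:

# Usage sketch, under the membership assumption stated above.
cs = ParamSklearnClassifier.get_hyperparameter_search_space()
configuration = cs.sample_configuration()
model = get_model(configuration, seed=1)  # a ParamSklearnClassifier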
Example n. 39
D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
X = D.data['X_train']
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.220000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'weighting',
            'classifier:__choice__': 'passive_aggressive',
            'classifier:passive_aggressive:C': 0.0022574783522003694,
            'classifier:passive_aggressive:fit_intercept': 'True',
            'classifier:passive_aggressive:loss': 'hinge',
            'classifier:passive_aggressive:n_iter': 119,
            'imputation:strategy': 'most_frequent',
            'one_hot_encoding:minimum_fraction': 0.1898871876010834,
            'one_hot_encoding:use_minimum_fraction': 'True',
            'preprocessor:__choice__': 'gem',
            'preprocessor:gem:N': 20,
            'preprocessor:gem:precond': 0.27540716190663134,
            'rescaling:__choice__': 'min/max'})),
     (0.160000, ParamSklearnClassifier(
        configuration={
            'balancing:strategy': 'none',
            'classifier:__choice__': 'passive_aggressive',
            'classifier:passive_aggressive:C': 8.011168723835382,
            'classifier:passive_aggressive:fit_intercept': 'True',
            'classifier:passive_aggressive:loss': 'hinge',
            'classifier:passive_aggressive:n_iter': 20,