def test_predict_proba_batched(self):
        # Multiclass
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')

        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
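        # Note on the expected counts above (added, hedged): X_test has 1647
        # rows, so batch_size=20 yields ceil(1647 / 20) = 83 batches; the
        # call_count of 84 presumably includes one extra predict_proba call
        # used to infer the output shape before batching.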
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
 def test_default_configuration(self):
     for i in range(2):
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline()
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.94,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example 3
 def test_default_configuration_multilabel(self):
     for i in range(2):
         cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"multilabel": True})
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris", make_multilabel=True)
         auto = SimpleClassificationPipeline(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995, sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
 def test_default_configuration_iterative_fit(self):
     classifier = SimpleClassificationPipeline(
         include={'classifier': ['random_forest'],
                  'preprocessor': ['no_preprocessing']})
     X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
     XT = classifier.fit_transformer(X_train, Y_train)
     for i in range(1, 11):
         classifier.iterative_fit(X_train, Y_train)
         self.assertEqual(classifier.steps[-1][-1].choice.estimator.n_estimators,
                          i)
Example 5
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={"preprocessor": ["densifier"]}, dataset_properties={"sparse": True}
        )
        self.assertEqual(cs.get_hyperparameter("classifier:__choice__").default, "qda")

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={"preprocessor": ["nystroem_sampler"]}
        )
        self.assertEqual(cs.get_hyperparameter("classifier:__choice__").default, "sgd")
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'preprocessor': ['densifier']},
            dataset_properties={'sparse': True})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                         'qda')

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'preprocessor': ['nystroem_sampler']})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                         'sgd')
Example 7
 def test_add_preprocessor(self):
     self.assertEqual(len(preprocessing_components._addons.components), 0)
     preprocessing_components.add_preprocessor(DummyPreprocessor)
     self.assertEqual(len(preprocessing_components._addons.components), 1)
     cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
     self.assertIn("DummyPreprocessor", str(cs))
     del preprocessing_components._addons.components["DummyPreprocessor"]
Example 8
 def test_add_classifier(self):
     self.assertEqual(len(classification_components._addons.components), 0)
     classification_components.add_classifier(DummyClassifier)
     self.assertEqual(len(classification_components._addons.components), 1)
     cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
     self.assertIn("DummyClassifier", str(cs))
     del classification_components._addons.components["DummyClassifier"]
Example 9
def _get_classification_configuration_space(info, include):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties,
        include=include)
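
# A minimal usage sketch for the helper above (not from the original source):
# assuming the task constants come from autosklearn.constants and that `info`
# carries the 'task' and 'is_sparse' keys read above, it could be exercised as:
#
#     from autosklearn.constants import MULTICLASS_CLASSIFICATION
#     info = {'task': MULTICLASS_CLASSIFICATION, 'is_sparse': 0}
#     cs = _get_classification_configuration_space(info, include=None)
#     print(cs.get_hyperparameter('classifier:__choice__'))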
Example 10
    def test_predict_batched_sparse(self):
        cls = SimpleClassificationPipeline(dataset_properties={'sparse': True},
                                           include={'classifier': ['sgd']})

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'classifier': ['libsvm_svc']})
        self.assertEqual(cs.get_hyperparameter('classifier:__choice__'),
            CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            exclude={'classifier': ['libsvm_svc']})
        self.assertNotIn('libsvm_svc', str(cs))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'preprocessor': ['select_percentile_classification']})
        self.assertEqual(cs.get_hyperparameter('preprocessor:__choice__'),
            CategoricalHyperparameter('preprocessor:__choice__',
                                      ['select_percentile_classification']))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            exclude={'preprocessor': ['select_percentile_classification']})
        self.assertNotIn('select_percentile_classification', str(cs))
Example 12
    def test_get_hyperparameter_search_space_dataset_properties(self):
        cs_mc = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"multiclass": True})
        self.assertNotIn("bernoulli_nb", str(cs_mc))

        cs_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"multilabel": True})
        self.assertNotIn("k_nearest_neighbors", str(cs_ml))
        self.assertNotIn("liblinear", str(cs_ml))
        self.assertNotIn("libsvm_svc", str(cs_ml))
        self.assertNotIn("sgd", str(cs_ml))

        cs_sp = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})
        self.assertIn("extra_trees", str(cs_sp))
        self.assertIn("gradient_boosting", str(cs_sp))
        self.assertIn("random_forest", str(cs_sp))

        cs_mc_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={"multilabel": True, "multiclass": True}
        )
        self.assertEqual(cs_ml, cs_mc_ml)
Example 13
    def test_get_hyperparameter_search_space_dataset_properties(self):
        cs_mc = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'multiclass': True})
        self.assertNotIn('bernoulli_nb', str(cs_mc))

        cs_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True})
        self.assertNotIn('k_nearest_neighbors', str(cs_ml))
        self.assertNotIn('liblinear', str(cs_ml))
        self.assertNotIn('libsvm_svc', str(cs_ml))
        self.assertNotIn('sgd', str(cs_ml))

        cs_sp = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        self.assertIn('extra_trees', str(cs_sp))
        self.assertIn('gradient_boosting', str(cs_sp))
        self.assertIn('random_forest', str(cs_sp))

        cs_mc_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True, 'multiclass': True})
        self.assertEqual(cs_ml, cs_mc_ml)
Example 14
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 15
 def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
     cls = SimpleClassificationPipeline(
         init_params={
             'categorical_encoding:one_hot_encoding:categorical_features':
                 [True, False]
         }
     )
     self.assertEqual(
         ohe_mock.call_args[1]['init_params'],
         {'one_hot_encoding:categorical_features': [True, False]}
     )
     default = cls.get_hyperparameter_search_space().get_default_configuration()
     cls.set_hyperparameters(configuration=default,
         init_params={
             'categorical_encoding:one_hot_encoding:categorical_features':
                 [True, True, False]
         }
     )
     self.assertEqual(
         ohe_mock.call_args[1]['init_params'],
         {'one_hot_encoding:categorical_features': [True, True, False]}
     )
Example 16
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(include={"classifier": ["libsvm_svc"]})
        self.assertEqual(
            cs.get_hyperparameter("classifier:__choice__"),
            CategoricalHyperparameter("classifier:__choice__", ["libsvm_svc"]),
        )

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(exclude={"classifier": ["libsvm_svc"]})
        self.assertNotIn("libsvm_svc", str(cs))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={"preprocessor": ["select_percentile_classification"]}
        )
        self.assertEqual(
            cs.get_hyperparameter("preprocessor:__choice__"),
            CategoricalHyperparameter("preprocessor:__choice__", ["select_percentile_classification"]),
        )

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            exclude={"preprocessor": ["select_percentile_classification"]}
        )
        self.assertNotIn("select_percentile_classification", str(cs))
 def test_default_configuration_multilabel(self):
     for i in range(2):
         dataset_properties = {'multilabel': True}
         classifier = SimpleClassificationPipeline(
             dataset_properties=dataset_properties)
         cs = classifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(
             dataset='iris', make_multilabel=True)
         classifier.set_hyperparameters(default)
         classifier = classifier.fit(X_train, Y_train)
         predictions = classifier.predict(X_test)
         self.assertAlmostEqual(
             0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = classifier.predict_proba(X_test)
Example 18
    def test_pipeline_clonability(self):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline()
        auto = auto.fit(X_train, Y_train)
        auto_clone = clone(auto)
        auto_clone_params = auto_clone.get_params()

        # Make sure all keys are copied properly
        for k, v in auto.get_params().items():
            self.assertIn(k, auto_clone_params)

        # Make sure the params getter of estimator are honored
        klass = auto.__class__
        new_object_params = auto.get_params(deep=False)
        for name, param in new_object_params.items():
            new_object_params[name] = clone(param, safe=False)
        new_object = klass(**new_object_params)
        params_set = new_object.get_params(deep=False)

        for name in new_object_params:
            param1 = new_object_params[name]
            param2 = params_set[name]
            self.assertEqual(param1, param2)
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = SimpleClassificationPipeline(include={'classifier': ['libsvm_svc']})\
            .get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__'),
            CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

        cs = SimpleClassificationPipeline(exclude={'classifier': ['libsvm_svc']}).\
            get_hyperparameter_search_space()
        self.assertNotIn('libsvm_svc', str(cs))

        cs = SimpleClassificationPipeline(
            include={'preprocessor': ['select_percentile_classification']}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('preprocessor:__choice__'),
            CategoricalHyperparameter('preprocessor:__choice__',
                                      ['select_percentile_classification']))

        cs = SimpleClassificationPipeline(exclude={
            'preprocessor': ['select_percentile_classification']
        }).get_hyperparameter_search_space()
        self.assertNotIn('select_percentile_classification', str(cs))
Example 20
def max_estimators_fit_duration(X,
                                y,
                                max_classifier_time_budget,
                                logger,
                                sample_factor=1):
    lo = utl.get_logger(inspect.stack()[0][3])

    lo.info("Constructing preprocessor pipeline and transforming sample data")
    # we don't care about the data here but need to preprocess, otherwise the classifiers crash

    pipeline = SimpleClassificationPipeline(include={
        'imputation': ['most_frequent'],
        'rescaling': ['standardize']
    })
    default_cs = pipeline.get_hyperparameter_search_space(
    ).get_default_configuration()
    pipeline = pipeline.set_hyperparameters(default_cs)

    pipeline.fit(X, y)
    X_tr, dummy = pipeline.fit_transformer(X, y)

    lo.info("Running estimators on the sample")
    # going over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default 3 sec
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(target=time_single_estimator,
                                         name=clf_name,
                                         args=(clf_name, clf_class, X_tr, y,
                                               max_clf_time, logger))
            pr.start()
            processes.append(pr)
        for pr in processes:
            # Block for at most max_classifier_time_budget seconds or until the
            # classifier fit process finishes; any process still running after
            # that is terminated below.
            pr.join(max_classifier_time_budget)
            if pr.is_alive():
                logger.info("Terminating " + pr.name +
                            " process due to timeout")
                pr.terminate()
        result_max_clf_time = max_clf_time.value

    lo.info("Test classifier fit completed")

    per_run_time_limit = int(sample_factor * result_max_clf_time)
    return max_classifier_time_budget if per_run_time_limit > max_classifier_time_budget else per_run_time_limit
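
# Hedged usage note (assumption, not in the original source): the return value
# is meant as a per-run time limit, capped at max_classifier_time_budget, e.g.:
#
#     per_run_time_limit = max_estimators_fit_duration(
#         X_train, y_train, max_classifier_time_budget=60, logger=logger)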
Example 21
    def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):

        # Mock _check_init_params_honored: no data preprocessor object is ever
        # created here, so that check would otherwise fail.
        with unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline'
                                 '._check_init_params_honored'):
            cls = SimpleClassificationPipeline(
                init_params={'data_preprocessing:categorical_features': [True, False]}
            )

            self.assertEqual(
                ohe_mock.call_args[1]['init_params'],
                {'categorical_features': [True, False]}
            )
            default = cls.get_hyperparameter_search_space().get_default_configuration()
            cls.set_hyperparameters(
                configuration=default,
                init_params={'data_preprocessing:categorical_features': [True, True, False]},
            )
            self.assertEqual(
                ohe_mock.call_args[1]['init_params'],
                {'categorical_features': [True, True, False]}
            )
 def test_default_configuration(self):
     for i in range(2):
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline()
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(
             0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
         auto.predict_proba(X_test)
def get_models_performance(reproduce_num, data_set_idx):
    '''
    reproduce_num: the number of model choices to reproduce for this dataset
    data_set_idx: index of the generated data set; the models tried for it are
        loaded from the corresponding JSON file
    return: dict of reproduced model performances (also written to a JSON file)
    '''
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    tried_models_filename = "./log/classifier_log" + str(
        data_set_idx) + "/tried_models_for_dataset" + str(
            data_set_idx) + ".json"
    models_performance = {}
    # duration = get_training_duration(data_set_idx)
    with open(tried_models_filename) as fp:
        models = json.load(fp)
        reproduce_num_act = min(len(models), reproduce_num)
        for i in range(1, reproduce_num_act + 1):
            model = models[str(i)]
            #print(model)
            train_accuracy_score = []
            test_accuracy_score = []
            train_log_loss = []
            test_log_loss = []
            #kf = KFold(n_splits=5, random_state=1, shuffle=True)
            time_start = time.time()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, shuffle=True)

            p = SimpleClassificationPipeline(config=model)
            p.fit(X_train, y_train)
            #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
            #print(scores)
            y_train_pred = p.predict(X_train)
            y_test_pred = p.predict(X_test)
            train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
            test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
            train_log_loss.append(log_loss(y_train, y_train_pred))
            test_log_loss.append(log_loss(y_test, y_test_pred))
            time_end = time.time()
            duration = time_end - time_start
            models_performance[i] = {
                "train_accuracy_score": np.mean(train_accuracy_score),
                "test_accuracy_score": np.mean(test_accuracy_score),
                "train_log_loss": np.mean(train_log_loss),
                "test_log_loss": np.mean(test_log_loss),
                "duration": duration
            }
            #if i in duration:
            #    models_performance[i]["duration"] = duration[i]
    reproduce_performance_json_filename = "./log/classifier_log" + str(
        data_set_idx) + "/reproduce_models_performance" + str(
            data_set_idx) + ".json"
    with open(reproduce_performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
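
# Hedged usage sketch (assumption, not in the original source): given the
# expected Data_Set/ files and ./log/classifier_log<idx>/ JSON files on disk,
# the function above could be driven like this:
#
#     models_performance = get_models_performance(reproduce_num=30, data_set_idx=0)
#     print(models_performance[1]["test_accuracy_score"])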
Example 24
    def test_get_hyperparameter_search_space_dataset_properties(self):
        cs_mc = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'multiclass': True})
        self.assertNotIn('bernoulli_nb', str(cs_mc))

        cs_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'multilabel': True})
        self.assertNotIn('k_nearest_neighbors', str(cs_ml))
        self.assertNotIn('liblinear', str(cs_ml))
        self.assertNotIn('libsvm_svc', str(cs_ml))
        self.assertNotIn('sgd', str(cs_ml))

        cs_sp = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        self.assertIn('extra_trees', str(cs_sp))
        self.assertIn('gradient_boosting', str(cs_sp))
        self.assertIn('random_forest', str(cs_sp))

        cs_mc_ml = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={
                'multilabel': True,
                'multiclass': True
            })
        self.assertEqual(cs_ml, cs_mc_ml)
 def test_default_configuration_iterative_fit(self):
     classifier = SimpleClassificationPipeline(
         include={'classifier': ['random_forest'],
                  'feature_preprocessor': ['no_preprocessing']})
     X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
     classifier.fit_transformer(X_train, Y_train)
     for i in range(1, 11):
         classifier.iterative_fit(X_train, Y_train)
         self.assertEqual(classifier.steps[-1][-1].choice.estimator.n_estimators,
                          i)
Example 26
 def test_default_configuration(self):
     for i in range(2):
         cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example 27
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
            self):
        cs = SimpleClassificationPipeline(
            include={'preprocessor': ['densifier']}, dataset_properties={'sparse': True}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'qda')

        cs = SimpleClassificationPipeline(include={'preprocessor': ['nystroem_sampler']}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'sgd')
Example 28
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                        "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example 29
 def test_default_configuration_multilabel(self):
     for i in range(2):
         dataset_properties = {'multilabel': True}
         classifier = SimpleClassificationPipeline(
             dataset_properties=dataset_properties)
         cs = classifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris',
                                                        make_multilabel=True)
         classifier.set_hyperparameters(default)
         classifier = classifier.fit(X_train, Y_train)
         predictions = classifier.predict(X_test)
         self.assertAlmostEqual(0.94,
                                sklearn.metrics.accuracy_score(predictions,
                                                               Y_test))
         scores = classifier.predict_proba(X_test)
Example 30
    def test_get_hyperparameter_search_space(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()

        self.assertEqual(len(cs.get_hyperparameter("rescaling:__choice__").choices), 4)
        self.assertEqual(len(cs.get_hyperparameter("classifier:__choice__").choices), 17)
        self.assertEqual(len(cs.get_hyperparameter("preprocessor:__choice__").choices), 14)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(157, len(hyperparameters))

        # for hp in sorted([str(h) for h in hyperparameters]):
        #    print hp

        # The four parameters which are always active are classifier,
        # preprocessor, imputation strategy and scaling strategy
        self.assertEqual(len(hyperparameters) - 6, len(conditions))
Example 31
 def test_produce_zero_scaling(self):
     from autosklearn.pipeline.classification import SimpleClassificationPipeline
     from autosklearn.pipeline import util as putil
     p = SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'qda',
         'classifier:qda:reg_param': 2.992955287687101,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'gem',
         'preprocessor:gem:N': 18,
         'preprocessor:gem:precond': 0.12360249797270745,
         'rescaling:__choice__': 'none'})
     X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
     self.assertRaisesRegexp(ValueError, 'Numerical problems in '
                                         'QDA. QDA.scalings_ contains '
                                         'values <= 0.0',
                             p.fit, X_train, Y_train)
def get_default_characteristics(auto, X, y):
    metrics = {}
    best_model = get_best_model(auto)
    metrics["Dataset"] = get_dataset(auto)
    metrics["Best-Model"] = get_best_model_name(auto)
    metrics["Hyperparameters"] = get_sorted_params(auto)
    metrics["Selection_alg"] = get_hpo(auto)
    best_model = SimpleClassificationPipeline(best_model)
    if auto.ensemble_size != 0:
        metrics.update(
            prefix_dict_keys(
                "Ensemble",
                get_cross_validation_metrics(auto, X, y, DEFAULT_METRICS)))
    metrics.update(
        prefix_dict_keys(
            "Best_model",
            get_cross_validation_metrics(best_model, X, y, DEFAULT_METRICS)))
    return metrics
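
# Hedged usage sketch (assumption): the helpers referenced above
# (get_best_model, get_best_model_name, get_sorted_params, get_hpo,
# prefix_dict_keys, get_cross_validation_metrics, DEFAULT_METRICS) are
# project-local and not shown here; given a fitted AutoSklearnClassifier
# `auto` and data X, y, the summary would be produced as:
#
#     metrics = get_default_characteristics(auto, X, y)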
    def test_configurations_categorical_data(self):
        cs = SimpleClassificationPipeline(
            dataset_properties={'sparse': False},
            random_state=1,
            include={
                'feature_preprocessor': ['no_preprocessing'],
                'classifier': ['sgd', 'adaboost']
            }).get_hyperparameter_search_space()

        categorical = [
            True, True, True, False, False, True, True, True, False, True,
            True, True, True, True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True, True, True, True,
            False, False, False, True, True, True
        ]
        categorical = {
            i: 'categorical' if bool_cat else 'numerical'
            for i, bool_cat in enumerate(categorical)
        }
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(
            os.path.join(this_directory, "components", "data_preprocessing",
                         "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.model_selection.train_test_split(X, y)
        data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        }

        init_params = {'data_preprocessor:feat_type': categorical}

        self._test_configurations(configurations_space=cs,
                                  make_sparse=True,
                                  data=data,
                                  init_params=init_params)
def get_performance_of_range_encoded_models(data_set_idx, encoded_all_model_hyperparameters, json_model, verbose=False):
    """
    Get model performances (30 x 5) from the encoded model-choice matrix (30 x 38)
    """
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    models_performance = {}
    #get_performance_of_encoded_model([X, y], encoded_all_model_hyperparameters[0])
    for i in range(len(encoded_all_model_hyperparameters)):
        #model = models[str(i)]
        encoded_model = encoded_all_model_hyperparameters[i]
        model = decode_model(encoded_model)
        if verbose:
            print('Original json model: ', json_model[str(i+1)])
            print('Encoded model: ', encoded_model)
            print('Decoded model: ', model)
            print("==========================================================")
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []
        #kf = KFold(n_splits=5, random_state=1)
        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)
        
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
        #print(scores)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start
        models_performance[i] = {"train_accuracy_score": np.mean(train_accuracy_score),
                         "test_accuracy_score": np.mean(test_accuracy_score),
                         "train_log_loss" : np.mean(train_log_loss),
                         "test_log_loss" : np.mean(test_log_loss),
                         "duration" : duration}
           
    performance_json_filename = "./log/classifier_log" + str(data_set_idx) + "/reproduce_models_performance" + str(data_set_idx) + ".json"
    with open(performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
Example 35
    def test_configurations_categorical_data(self):
        cs = SimpleClassificationPipeline(
            dataset_properties={'sparse': False},
            include={
                'preprocessor': ['no_preprocessing'],
                'classifier': ['sgd', 'gradient_boosting']
            }).get_hyperparameter_search_space()

        categorical = [
            True, True, True, False, False, True, True, True, False, True,
            True, True, True, True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True, True, True, True,
            False, False, False, True, True, True
        ]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(
            os.path.join(this_directory, "components", "data_preprocessing",
                         "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.model_selection.train_test_split(X, y)
        data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        }

        init_params = {
            'categorical_encoding:one_hot_encoding:categorical_features':
            categorical
        }

        self._test_configurations(configurations_space=cs,
                                  make_sparse=True,
                                  data=data,
                                  init_params=init_params)
    def test_predict_batched(self):
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)
Example 37
    def test_get_hyperparameter_search_space(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()

        self.assertEqual(
            len(cs.get_hyperparameter('rescaling:__choice__').choices), 4)
        self.assertEqual(
            len(cs.get_hyperparameter('classifier:__choice__').choices), 17)
        self.assertEqual(
            len(cs.get_hyperparameter('preprocessor:__choice__').choices), 14)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(157, len(hyperparameters))

        #for hp in sorted([str(h) for h in hyperparameters]):
        #    print hp

        # The four parameters which are always active are classifier,
        # preprocessor, imputation strategy and scaling strategy
        self.assertEqual(len(hyperparameters) - 6, len(conditions))
Example 38
 def test_fit_instantiates_component(self):
     """Make sure that if a preprocessor is added, it's fit
     method is called"""
     preprocessing_components.add_preprocessor(CrashPreprocessor)
     cls = SimpleClassificationPipeline()
     cs = cls.get_hyperparameter_search_space()
     self.assertIn('CrashPreprocessor', str(cs))
     config = cs.sample_configuration()
     config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
     cls.set_hyperparameters(config)
     with self.assertRaisesRegex(
         ValueError,
         "Make sure fit is called"
     ):
         cls.fit(
             X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
             y=np.array([1, 0, 1, 1])
         )
     del preprocessing_components._addons.components['CrashPreprocessor']
    def test_get_hyperparameter_search_space(self):
        cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()
        forbiddens = cs.get_forbiddens()

        self.assertEqual(len(cs.get_hyperparameter(
            'data_preprocessing:numerical_transformer:rescaling:__choice__').choices), 6)
        self.assertEqual(len(cs.get_hyperparameter(
            'classifier:__choice__').choices), 16)
        self.assertEqual(len(cs.get_hyperparameter(
            'feature_preprocessor:__choice__').choices), 13)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(167, len(hyperparameters))

        # for hp in sorted([str(h) for h in hyperparameters]):
        #    print hp

        # The four components which are always active are classifier,
        # feature preprocessor, balancing and data preprocessing pipeline.
        self.assertEqual(len(hyperparameters) - 7, len(conditions))

        self.assertEqual(len(forbiddens), 53)
    def test_configurations_sparse(self):
        cs = SimpleClassificationPipeline(dataset_properties={'sparse': True}).\
            get_hyperparameter_search_space()

        self._test_configurations(configurations_space=cs, make_sparse=True)
    def test_configurations(self):
        cs = SimpleClassificationPipeline().get_hyperparameter_search_space()

        self._test_configurations(configurations_space=cs)
 def test_repr(self):
     representation = repr(SimpleClassificationPipeline())
     cls = eval(representation)
     self.assertIsInstance(cls, SimpleClassificationPipeline)
Example 43
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example 44
    def test_predict_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)] for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 45
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)] for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, ((1647, 10)))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_set_hyperparameters_honors_configuration(self):
        """Makes sure that a given configuration is honored in practice.

        This method tests that the set hyperparameters actually create objects
        that comply with the given configuration. It iterates trough the pipeline to
        make sure we did not miss a step, but also checks at the end that every
        configuration from Config was checked
        """

        all_combinations = list(itertools.product([True, False], repeat=4))
        for sparse, multilabel, signed, multiclass in all_combinations:
            dataset_properties = {
                'sparse': sparse,
                'multilabel': multilabel,
                'multiclass': multiclass,
                'signed': signed,
            }
            cls = SimpleClassificationPipeline(
                random_state=1,
                dataset_properties=dataset_properties,
            )
            cs = cls.get_hyperparameter_search_space()
            config = cs.sample_configuration()

            # Set hyperparameters takes a given config and translate
            # a config to an actual implementation
            cls.set_hyperparameters(config)
            config_dict = config.get_dictionary()

            # keys_checked is our mechanism to ensure that every config key
            # is checked
            keys_checked = []

            for name, step in cls.named_steps.items():
                if name == 'data_preprocessing':
                    # We have to check both the numerical and categorical
                    to_check = {
                        'numerical_transformer': step.numer_ppl.named_steps,
                        'categorical_transformer': step.categ_ppl.named_steps,
                    }

                    for data_type, pipeline in to_check.items():
                        for sub_name, sub_step in pipeline.items():
                            # If it is a Choice, make sure it is the correct one!
                            if isinstance(sub_step, AutoSklearnChoice):
                                key = "data_preprocessing:{}:{}:__choice__".format(
                                    data_type,
                                    sub_name
                                )
                                keys_checked.extend(
                                    self._test_set_hyperparameter_choice(
                                        key, sub_step, config_dict
                                    )
                                )
                            # If it is a component, make sure it has the correct hyperparams
                            elif isinstance(sub_step, AutoSklearnComponent):
                                keys_checked.extend(
                                    self._test_set_hyperparameter_component(
                                        "data_preprocessing:{}:{}".format(
                                            data_type,
                                            sub_name
                                        ),
                                        sub_step, config_dict
                                    )
                                )
                            else:
                                raise ValueError("New type of pipeline component!")
                elif name == 'balancing':
                    keys_checked.extend(
                        self._test_set_hyperparameter_component(
                            'balancing',
                            step, config_dict
                        )
                    )
                elif name == 'feature_preprocessor':
                    keys_checked.extend(
                        self._test_set_hyperparameter_choice(
                            'feature_preprocessor:__choice__', step, config_dict
                        )
                    )
                elif name == 'classifier':
                    keys_checked.extend(
                        self._test_set_hyperparameter_choice(
                            'classifier:__choice__', step, config_dict
                        )
                    )
                else:
                    raise ValueError("Found another type of step! Need to update this check")

            # Make sure we checked the whole configuration
            self.assertSetEqual(set(config_dict.keys()), set(keys_checked))
Example 47
        new_info_object['task'] = STRING_TO_TASK_TYPES[D.info['task']]
        new_info_object['metric'] = STRING_TO_METRIC[D.info['metric']]

        configuration_space = get_configuration_space(new_info_object)
        try:
            config = ConfigSpace.Configuration(configuration_space, configuration)
        except Exception as inst:
            execution_success = False
            logger.critical(inst)
            continue

        logger.info("Running the following configuration:")
        logger.info(str(config))

        if 'classifier:__choice__' in configuration:
            M = SimpleClassificationPipeline(config, 1)
        elif 'regressor:__choice__' in configuration:
            M = SimpleRegressionPipeline(config, 1)
        else:
            execution_success = False
            logger.critical('Invalid hyperparameter configuration, does neither '
                            'contain hyperparameter classifier:__choice__ nor '
                            'regressor:__choice__!')
            continue

        evaluate_model = pynisher.enforce_limits(
            mem_in_mb=memlimit, wall_time_in_s=overall_time_budget)(
                pynish_me_aka_evaluate_model)
        rval = evaluate_model(D, M)
        if rval is not None:
            if isinstance(rval, ValueError) and rval.message == "KernelPCA " \
Example 48
    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.709, 0.658),
                 ('decision_tree', DecisionTree, 0.683, 0.701),
                 ('extra_trees', ExtraTreesClassifier, 0.812, 0.8),
                 ('gradient_boosting', GradientBoostingClassifier,
                    0.800, 0.760),
                 ('random_forest', RandomForest, 0.846, 0.792),
                 ('libsvm_svc', LibSVM_SVC, 0.571, 0.658),
                 ('liblinear_svc', LibLinear_SVC, 0.685, 0.699),
                 ('sgd', SGD, 0.65384615384615385, 0.38795986622073581)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.\
                    get_hyperparameter_search_space(
                        include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                    sklearn.metrics.f1_score(predictions, Y_test),
                    places=3)

                # pre_transform and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.7142857142857143,
                    0.72180451127819545),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.5934065934065933, 0.71111111111111114)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)
Example 49
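 # Checks that the repr() of a default pipeline can be eval()'d back into a
 # SimpleClassificationPipeline instance.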
 def test_repr(self):
     cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
     default = cs.get_default_configuration()
     representation = repr(SimpleClassificationPipeline(default))
     cls = eval(representation)
     self.assertIsInstance(cls, SimpleClassificationPipeline)
Example 50
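    # Samples ten random configurations from the multilabel search space and
    # fits them on synthetic multilabel data under a ~4 GiB address-space
    # limit, skipping configurations that fail with known numerical,
    # convergence, or memory problems.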
    def test_multilabel(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        dataset_properties = {"multilabel": True}
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties=dataset_properties)

        print(cs)
        cs.seed(5)

        for i in range(10):
            X, Y = sklearn.datasets.make_multilabel_classification(
                n_samples=150,
                n_features=20,
                n_classes=5,
                n_labels=2,
                length=50,
                allow_unlabeled=True,
                sparse=False,
                return_indicator=True,
                return_distributions=False,
                random_state=1,
            )
            X_train = X[:100, :]
            Y_train = Y[:100, :]
            X_test = X[101:, :]
            Y_test = Y[101:, :]

            config = cs.sample_configuration()

            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                for proba in predicted_probabilities:
                    self.assertIsInstance(proba, np.ndarray)
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example 51
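    # Batched predict_proba on sparse digits data with a fixed random forest
    # configuration: the final pipeline step is wrapped in a mock so the test
    # can assert how often predict_proba is called and that the batched output
    # matches the unbatched one, for both multiclass and multilabel targets.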
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": "True",
                "preprocessor:__choice__": "no_preprocessing",
                "classifier:random_forest:bootstrap": "True",
                "classifier:random_forest:criterion": "gini",
                "classifier:random_forest:max_depth": "None",
                "classifier:random_forest:min_samples_split": 2,
                "classifier:random_forest:min_samples_leaf": 2,
                "classifier:random_forest:min_weight_fraction_leaf": 0.0,
                "classifier:random_forest:max_features": 0.5,
                "classifier:random_forest:max_leaf_nodes": "None",
                "classifier:random_forest:n_estimators": 100,
                "rescaling:__choice__": "min/max",
            },
        )

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        Y_train = np.array(list([(list([1 if i != y else 0 for i in range(10)])) for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 52
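    # Samples configurations from the sparse search space and fits them on a
    # locally stored dataset with a hand-written categorical feature mask
    # passed via init_params, tolerating a fixed list of known fit-time errors
    # and warnings.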
    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            print(config)
            categorical = [
                True,
                True,
                True,
                False,
                False,
                True,
                True,
                True,
                False,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                True,
                False,
                False,
                False,
                True,
                True,
                True,
            ]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(os.path.join(this_directory, "components", "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, y)

            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train, init_params={"one_hot_encoding:categorical_features": categorical})
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
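    # Shared helper: samples ten configurations, caps hyperparameters that
    # would be too expensive on CI, checks that pipeline steps are not marked
    # as fitted before fit() and are marked as fitted afterwards, and ignores
    # a whitelist of known numerical errors and warnings.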
    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        print(configurations_space)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'classifier:passive_aggressive:n_iter': 5,
                            'classifier:sgd:n_iter': 5,
                            'classifier:adaboost:n_estimators': 50,
                            'classifier:adaboost:max_depth': 1,
                            'feature_preprocessor:kernel_pca:n_components': 10,
                            'feature_preprocessor:kitchen_sinks:n_components': 50,
                            'classifier:proj_logit:max_epochs': 1,
                            'classifier:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'feature_preprocessor:truncatedSVD:target_dim': 10,
                            'feature_preprocessor:polynomial:degree': 2,
                            'classifier:lda:n_components': 10,
                            'feature_preprocessor:nystroem_sampler:n_components': 50,
                            'feature_preprocessor:feature_agglomeration:n_clusters': 2,
                            'classifier:gradient_boosting:max_leaf_nodes': 64}

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            print(config)

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(random_state=1,
                                               dataset_properties=dataset_properties,
                                               init_params=init_params_,)
            cls.set_hyperparameters(config, init_params=init_params_)

            # First make sure that for this configuration, setting the parameters
            # does not mistakenly set the estimator as fitted
            for name, step in cls.named_steps.items():
                with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
                                            "instance is not fitted yet"):
                    check_is_fitted(step)

            try:
                cls.fit(X_train, Y_train)

                # After fit, all components should be tagged as fitted
                # by sklearn. Check is fitted raises an exception if that
                # is not the case
                try:
                    for name, step in cls.named_steps.items():
                        check_is_fitted(step)
                except sklearn.exceptions.NotFittedError:
                    self.fail("config={} raised NotFittedError unexpectedly!".format(
                        config
                    ))

                cls.predict(X_test.copy())
                cls.predict_proba(X_test)
            except MemoryError:
                continue
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                     'values.' in e.args[0]:
                    continue
                elif 'Internal work array size computation failed' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "invalid value encountered in multiply" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                elif "invalid value encountered in multiply" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
Example 54
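    # Another variant of the helper above: it applies the same kind of
    # hyperparameter restrictions, then simply fits and predicts, swallowing
    # the same family of known errors and warnings.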
    def _test_configurations(self,
                             configurations_space,
                             make_sparse=False,
                             data=None,
                             init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {
                'classifier:passive_aggressive:n_iter': 5,
                'classifier:sgd:n_iter': 5,
                'classifier:adaboost:n_estimators': 50,
                'classifier:adaboost:max_depth': 1,
                'preprocessor:kernel_pca:n_components': 10,
                'preprocessor:kitchen_sinks:n_components': 50,
                'classifier:proj_logit:max_epochs': 1,
                'classifier:libsvm_svc:degree': 2,
                'regressor:libsvm_svr:degree': 2,
                'preprocessor:truncatedSVD:target_dim': 10,
                'preprocessor:polynomial:degree': 2,
                'classifier:lda:n_components': 10,
                'preprocessor:nystroem_sampler:n_components': 50,
                'preprocessor:feature_agglomeration:n_clusters': 2
            }

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(
                random_state=1,
                dataset_properties=dataset_properties,
                init_params=init_params_)
            cls.set_hyperparameters(config)
            try:
                cls.fit(
                    X_train,
                    Y_train,
                )
                predictions = cls.predict(X_test)
            except MemoryError as e:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                        'values.' in e.args[0]:
                    continue
                elif 'which is larger than the original space with n_features='\
                        in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
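    # Near-duplicate of the earlier sparse batched predict_proba test: a fixed
    # random forest Configuration, with the multilabel targets built from a
    # zero/one indicator matrix instead of a list comprehension.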
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(cs,
                               values={"balancing:strategy": "none",
                                       "classifier:__choice__": "random_forest",
                                       "imputation:strategy": "mean",
                                       "one_hot_encoding:minimum_fraction": 0.01,
                                       "one_hot_encoding:use_minimum_fraction": 'True',
                                       "preprocessor:__choice__": "no_preprocessing",
                                       'classifier:random_forest:bootstrap': 'True',
                                       'classifier:random_forest:criterion': 'gini',
                                       'classifier:random_forest:max_depth': 'None',
                                       'classifier:random_forest:min_samples_split': 2,
                                       'classifier:random_forest:min_samples_leaf': 2,
                                       'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                                       'classifier:random_forest:max_features': 0.5,
                                       'classifier:random_forest:max_leaf_nodes': 'None',
                                       'classifier:random_forest:n_estimators': 100,
                                       "rescaling:__choice__": "min/max"})

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train_ = np.zeros((Y_train.shape[0], 10))
        for i, y in enumerate(Y_train):
            Y_train_[i][y] = 1
        Y_train = Y_train_
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
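    # Another variant of the sparse batched predict_proba test: the pipeline
    # is built via include={'classifier': [...]} instead of an explicit
    # Configuration, and the mock wraps predict_proba on the last step
    # directly.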
    def test_predict_proba_batched_sparse(self):

        cls = SimpleClassificationPipeline(
            dataset_properties={'sparse': True, 'multiclass': True},
            include={'classifier': ['sgd']})

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(
            dataset_properties={'sparse': True, 'multilabel': True},
            include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array(list([(list([1 if i != y else 0 for i in range(10)]))
                                 for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)
Example 57
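    # Weighting test on a smaller synthetic dataset (200 samples, 100 for
    # training): reference f1 scores are compared per classifier with a
    # per-entry rounding precision, and the pipelines are built with explicit
    # include= restrictions.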
    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=200, n_features=10, n_redundant=2, n_informative=2,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting, places in \
                [('adaboost', AdaboostClassifier, 0.810, 0.735, 3),
                 ('decision_tree', DecisionTree, 0.780, 0.643, 3),
                 ('extra_trees', ExtraTreesClassifier, 0.780, 0.8, 3),
                 ('gradient_boosting', GradientBoostingClassifier,
                  0.737, 0.684, 3),
                 ('random_forest', RandomForest, 0.780, 0.789, 3),
                 ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3),
                 ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3),
                 ('passive_aggressive', PassiveAggressive, 0.642, 0.449, 3),
                 ('sgd', SGD, 0.818, 0.575, 2)
                ]:
            for strategy, acc in [
                ('none', acc_no_weighting),
                ('weighting', acc_weighting)
            ]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': [name],
                           'preprocessor': ['no_preprocessing']}
                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=places, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                classifier.set_hyperparameters(configuration=default)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=places)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.810, 0.563),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.837, 0.567)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': ['sgd'], 'preprocessor': [name]}

                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier.set_hyperparameters(default)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)
Example 58
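    # Further weighting-test variant with slightly different reference scores;
    # here the rounding precision is fixed to three places for every
    # classifier and preprocessor.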
    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=200, n_features=10, n_redundant=2, n_informative=2,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.810, 0.735),
                 ('decision_tree', DecisionTree, 0.780, 0.643),
                 ('extra_trees', ExtraTreesClassifier, 0.75, 0.800),
                 ('gradient_boosting', GradientBoostingClassifier,
                  0.789, 0.762),
                 ('random_forest', RandomForest, 0.75, 0.821),
                 ('libsvm_svc', LibSVM_SVC, 0.769, 0.72),
                 ('liblinear_svc', LibLinear_SVC, 0.762, 0.735),
                 ('sgd', SGD, 0.704, 0.667)
                ]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': [name],
                           'preprocessor': ['no_preprocessing']}
                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                classifier.set_hyperparameters(configuration=default)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.691, 0.692),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.692, 0.590)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': ['sgd'], 'preprocessor': [name]}

                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier.set_hyperparameters(default)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)
Example 59
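    # Third copy of the configuration-testing helper in this collection; it
    # additionally caps gradient boosting's max_depth and n_estimators before
    # fitting and predicting.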
    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        print(configurations_space)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'classifier:passive_aggressive:n_iter': 5,
                            'classifier:sgd:n_iter': 5,
                            'classifier:adaboost:n_estimators': 50,
                            'classifier:adaboost:max_depth': 1,
                            'preprocessor:kernel_pca:n_components': 10,
                            'preprocessor:kitchen_sinks:n_components': 50,
                            'classifier:proj_logit:max_epochs': 1,
                            'classifier:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'preprocessor:truncatedSVD:target_dim': 10,
                            'preprocessor:polynomial:degree': 2,
                            'classifier:lda:n_components': 10,
                            'preprocessor:nystroem_sampler:n_components': 50,
                            'preprocessor:feature_agglomeration:n_clusters': 2,
                            'classifier:gradient_boosting:max_depth': 2,
                            'classifier:gradient_boosting:n_estimators': 50}

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            print(config)

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(random_state=1,
                                               dataset_properties=dataset_properties,
                                               init_params=init_params_,)
            cls.set_hyperparameters(config, init_params=init_params_)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test.copy())
                predicted_probabilities = cls.predict_proba(X_test)
            except MemoryError as e:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                     'values.' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e