Example #1
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'classifier': ['decision_tree']})
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
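A quick sanity check on the asserted call counts: with 1647 test rows and batch_size=20, the batched path needs ceil(1647 / 20) = 83 batches, which matches the predict() call counts asserted in the examples below. The sketch assumes the one extra predict_proba call (84 rather than 83) is a single up-front call used to size the output array; the tests themselves only pin the totals.

import math

n_samples, batch_size = 1647, 20
n_batches = math.ceil(n_samples / batch_size)
assert n_batches == 83       # batches of at most 20 rows each
assert n_batches + 1 == 84   # plus one assumed shape-probing call in predict_proba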
Example #2
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'classifier': ['decision_tree']},
            dataset_properties={'sparse': True})
        config = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #3
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #4
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError:
                continue
Example #5
    def test_predict_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #6
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #7
def get_models_performance_by_data(data):
    """Evaluate one randomly sampled model configuration on (X, y) with 5-fold CV."""
    X, y = data
    model = random_model()
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    time_start = time.time()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
    time_end = time.time()
    duration = time_end - time_start
    models_performance = {
        "train_accuracy_score": np.mean(train_accuracy_score),
        "test_accuracy_score": np.mean(test_accuracy_score),
        "train_log_loss": np.mean(train_log_loss),
        "test_log_loss": np.mean(test_log_loss),
        "duration": duration / 5
    }
    return models_performance
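A hypothetical invocation of the helper above, using a synthetic dataset; random_model() is assumed to return a valid SimpleClassificationPipeline configuration (it is defined elsewhere in the project):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
perf = get_models_performance_by_data((X, y))
print(perf["test_accuracy_score"], perf["duration"])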
Example #8
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError:
                continue
Example #9
def get_performance_of_encoded_model(data_set, encoded_model, verbose=False):
    """
    Get model performance array(4 * 1) from encoded model vector(17 * 1)
    data_set : (X, y) input dataset to get performance
    encoded_model : encoded model choice vector (17 * 1)
    verbose : if True, will log model choice dictionary and model performance array
    return : model performance vector(4 * 1)
    """
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []
    X, y = data_set
    #kf = KFold(n_splits=5, random_state=1, shuffle=True)
    model = decode_model(encoded_model)
    if verbose:
        print('Model choice: {0}'.format(model))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        
    p = SimpleClassificationPipeline(config=model)
    p.fit(X_train, y_train)
    #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
    #print(scores)
    y_train_pred = p.predict(X_train)
    y_test_pred = p.predict(X_test)
    train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
    test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
    train_log_loss.append(log_loss(y_train, y_train_pred))
    test_log_loss.append(log_loss(y_test, y_test_pred))
    model_performance = np.array([np.mean(train_accuracy_score), np.mean(test_accuracy_score), np.mean(train_log_loss), np.mean(test_log_loss)])
    if verbose:
        print('Model Performance: {0}'.format(model_performance))
    return model_performance
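A hypothetical call to the function above; the all-zeros vector is only a placeholder, since the actual 17-element encoding scheme is defined by decode_model, which is not shown here:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
encoded_model = np.zeros(17)  # placeholder encoding (assumption)
perf = get_performance_of_encoded_model((X, y), encoded_model, verbose=True)
# perf -> [train_accuracy, test_accuracy, train_log_loss, test_log_loss]

Example #10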
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train_ = np.zeros((Y_train.shape[0], 10))
        for i, y in enumerate(Y_train):
            Y_train_[i][y] = 1
        Y_train = Y_train_
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #11
    def test_predict_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #12
    def test_predict_proba_batched(self):
        # Multiclass
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')

        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #13
    def test_predict_proba_batched(self):
        # Multiclass
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')

        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #14
    def test_fit_instantiates_component(self):
        """Make sure that if a preprocessor is added, it's fit
        method is called"""
        preprocessing_components.add_preprocessor(CrashPreprocessor)

        # We reduce the search space because forbidden clauses prevent
        # instantiating the user-defined preprocessor manually
        cls = SimpleClassificationPipeline(
            include={'classifier': ['random_forest']})
        cs = cls.get_hyperparameter_search_space()
        self.assertIn('CrashPreprocessor', str(cs))
        config = cs.sample_configuration()
        try:
            config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
        except Exception as e:
            # In case of failure, clean up the components and print enough
            # information to debug the failure later
            del preprocessing_components._addons.components[
                'CrashPreprocessor']
            self.fail("cs={} config={} Exception={}".format(cs, config, e))
        cls.set_hyperparameters(config)
        with self.assertRaisesRegex(ValueError, "Make sure fit is called"):
            cls.fit(X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
                    y=np.array([1, 0, 1, 1]))
        del preprocessing_components._addons.components['CrashPreprocessor']
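CrashPreprocessor itself is not part of this example. Below is a minimal sketch of what such a test fixture might look like, assuming auto-sklearn's preprocessing component API (AutoSklearnPreprocessingAlgorithm); the only behavior the test actually relies on is that fit raises a ValueError containing "Make sure fit is called":

from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class CrashPreprocessor(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        super().__init__()

    def fit(self, X, y=None):
        # The message matched by assertRaisesRegex in the test above
        raise ValueError("Make sure fit is called")

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'CrashPreprocessor',
                'name': 'CrashPreprocessor',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        return ConfigurationSpace()

Example #15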
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"
            })

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #16
def get_models_performance(reproduce_num, data_set_idx):
    '''
    reproduce_num : number of model choices to reproduce for the dataset
    data_set_idx : index of the generated dataset; the tried-models JSON file for that dataset is loaded
    return : dict of reproduced model performances (also dumped to a JSON file)
    '''
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    tried_models_filename = "./log/classifier_log" + str(
        data_set_idx) + "/tried_models_for_dataset" + str(
            data_set_idx) + ".json"
    models_performance = {}
    # duration = get_training_duration(data_set_idx)
    with open(tried_models_filename) as fp:
        models = json.load(fp)
        reproduce_num_act = min(len(models), reproduce_num)
        for i in range(1, reproduce_num_act + 1):
            model = models[str(i)]
            #print(model)
            train_accuracy_score = []
            test_accuracy_score = []
            train_log_loss = []
            test_log_loss = []
            #kf = KFold(n_splits=5, random_state=1, shuffle=True)
            time_start = time.time()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, shuffle=True)

            p = SimpleClassificationPipeline(config=model)
            p.fit(X_train, y_train)
            #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
            #print(scores)
            y_train_pred = p.predict(X_train)
            y_test_pred = p.predict(X_test)
            train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
            test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
            train_log_loss.append(log_loss(y_train, y_train_pred))
            test_log_loss.append(log_loss(y_test, y_test_pred))
            time_end = time.time()
            duration = time_end - time_start
            models_performance[i] = {
                "train_accuracy_score": np.mean(train_accuracy_score),
                "test_accuracy_score": np.mean(test_accuracy_score),
                "train_log_loss": np.mean(train_log_loss),
                "test_log_loss": np.mean(test_log_loss),
                "duration": duration
            }
            #if i in duration:
            #    models_performance[i]["duration"] = duration[i]
    reproduce_performance_json_filename = "./log/classifier_log" + str(
        data_set_idx) + "/reproduce_models_performance" + str(
            data_set_idx) + ".json"
    with open(reproduce_performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
Example #17
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(cs,
                               values={"balancing:strategy": "none",
                                       "classifier:__choice__": "random_forest",
                                       "imputation:strategy": "mean",
                                       "one_hot_encoding:minimum_fraction": 0.01,
                                       "one_hot_encoding:use_minimum_fraction": 'True',
                                       "preprocessor:__choice__": "no_preprocessing",
                                       'classifier:random_forest:bootstrap': 'True',
                                       'classifier:random_forest:criterion': 'gini',
                                       'classifier:random_forest:max_depth': 'None',
                                       'classifier:random_forest:min_samples_split': 2,
                                       'classifier:random_forest:min_samples_leaf': 2,
                                       'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                                       'classifier:random_forest:max_features': 0.5,
                                       'classifier:random_forest:max_leaf_nodes': 'None',
                                       'classifier:random_forest:n_estimators': 100,
                                       "rescaling:__choice__": "min/max"})

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train_ = np.zeros((Y_train.shape[0], 10))
        for i, y in enumerate(Y_train):
            Y_train_[i][y] = 1
        Y_train = Y_train_
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #18
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                        "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": "True",
                "preprocessor:__choice__": "no_preprocessing",
                "classifier:random_forest:bootstrap": "True",
                "classifier:random_forest:criterion": "gini",
                "classifier:random_forest:max_depth": "None",
                "classifier:random_forest:min_samples_split": 2,
                "classifier:random_forest:min_samples_leaf": 2,
                "classifier:random_forest:min_weight_fraction_leaf": 0.0,
                "classifier:random_forest:max_features": 0.5,
                "classifier:random_forest:max_leaf_nodes": "None",
                "classifier:random_forest:n_estimators": 100,
                "rescaling:__choice__": "min/max",
            },
        )

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #20
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": "True",
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                "rescaling:__choice__": "min/max"
            })
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #21
def get_performance_of_range_encoded_models(data_set_idx, encoded_all_model_hyperparameters,
                                            json_model, verbose=False):
    """
    Compute model performances (30 x 5) from an encoded model-choice matrix (30 x 38).
    """
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    models_performance = {}
    #get_performance_of_encoded_model([X, y], encoded_all_model_hyperparameters[0])
    for i in range(len(encoded_all_model_hyperparameters)):
        #model = models[str(i)]
        encoded_model = encoded_all_model_hyperparameters[i]
        model = decode_model(encoded_model)
        if verbose:
            print('Original json model: ', json_model[str(i + 1)])
            print('Encoded model: ', encoded_model)
            print('Decoded model: ', model)
            print("==========================================================")
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []
        #kf = KFold(n_splits=5, random_state=1)
        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)
        
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
        #print(scores)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start
        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration,
        }

    performance_json_filename = "./log/classifier_log" + str(data_set_idx) + "/reproduce_models_performance" + str(data_set_idx) + ".json"
    with open(performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
Example #22
    def test_predict_batched(self):
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)
Example #23
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        config = Configuration(cs,
            values={"balancing:strategy": "none",
                    "classifier:__choice__": "random_forest",
                    "imputation:strategy": "mean",
                    "one_hot_encoding:minimum_fraction": 0.01,
                    "one_hot_encoding:use_minimum_fraction": "True",
                    "preprocessor:__choice__": "no_preprocessing",
                    'classifier:random_forest:bootstrap': 'True',
                    'classifier:random_forest:criterion': 'gini',
                    'classifier:random_forest:max_depth': 'None',
                    'classifier:random_forest:min_samples_split': 2,
                    'classifier:random_forest:min_samples_leaf': 2,
                    'classifier:random_forest:max_features': 0.5,
                    'classifier:random_forest:max_leaf_nodes': 'None',
                    'classifier:random_forest:n_estimators': 100,
                    'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                    "rescaling:__choice__": "min/max"})
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #24
    def test_predict_batched_sparse(self):
        cls = SimpleClassificationPipeline(dataset_properties={'sparse': True},
                                           include={'classifier': ['sgd']})

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #25
 def test_default_configuration(self):
     for i in range(2):
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline()
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
         auto.predict_proba(X_test)
Example #26
 def test_default_configuration(self):
     for i in range(2):
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline()
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.94,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example #27
 def test_fit_instantiates_component(self):
     """Make sure that if a preprocessor is added, it's fit
     method is called"""
     preprocessing_components.add_preprocessor(CrashPreprocessor)
     cls = SimpleClassificationPipeline()
     cs = cls.get_hyperparameter_search_space()
     self.assertIn('CrashPreprocessor', str(cs))
     config = cs.sample_configuration()
     config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
     cls.set_hyperparameters(config)
     with self.assertRaisesRegex(
         ValueError,
         "Make sure fit is called"
     ):
         cls.fit(
             X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
             y=np.array([1, 0, 1, 1])
         )
     del preprocessing_components._addons.components['CrashPreprocessor']
Example #28
 def test_default_configuration_multilabel(self):
     for i in range(2):
         cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"multilabel": True})
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris", make_multilabel=True)
         auto = SimpleClassificationPipeline(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995, sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example #29
 def test_default_configuration(self):
     for i in range(2):
         cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = SimpleClassificationPipeline(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example #30
 def test_default_configuration_multilabel(self):
     for i in range(2):
         classifier = SimpleClassificationPipeline(
             random_state=1, dataset_properties={'multilabel': True})
         cs = classifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(
             dataset='iris', make_multilabel=True)
         classifier.set_hyperparameters(default)
         classifier = classifier.fit(X_train, Y_train)
         predictions = classifier.predict(X_test)
         self.assertAlmostEqual(
             0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
         classifier.predict_proba(X_test)
Example #31
def max_estimators_fit_duration(X, y, max_classifier_time_budget, sample_factor=1):
    p("constructing preprocessor pipeline and transforming sample dataset")
    # We don't care about the data here, but it must be preprocessed,
    # otherwise the classifiers crash.
    # include={'imputation': 'most_frequent', 'rescaling': 'standardize'}
    default_cs = SimpleClassificationPipeline(
        ).get_hyperparameter_search_space(
        ).get_default_configuration()
    preprocessor = SimpleClassificationPipeline(default_cs, random_state=42)
    preprocessor.fit(X, y)
    X_tr, dummy = preprocessor.pre_transform(X, y)

    p("running estimators on a subset")
    # Go over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default: 3 seconds
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(
                target=time_single_estimator, name=clf_name,
                args=(clf_name, clf_class, X_tr, y, max_clf_time))
            pr.start()
            processes.append(pr)
        for pr in processes:
            # Block for max_classifier_time_budget seconds, or until the
            # classifier's fit process finishes; terminate any process
            # still running after the budget is exhausted.
            pr.join(max_classifier_time_budget)
            if pr.is_alive():
                p("terminating " + pr.name + " process due to timeout")
                pr.terminate()
        result_max_clf_time = max_clf_time.value

    p("test classifier fit completed")

    per_run_time_limit = int(sample_factor * result_max_clf_time)
    return min(per_run_time_limit, max_classifier_time_budget)
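A hypothetical call, assuming X and y are already loaded; the return value is intended to be used as auto-sklearn's per_run_time_limit:

# Cap any single classifier fit at 600 s and double the measured worst-case
# fit time as a safety margin (sample_factor=2).
per_run_time_limit = max_estimators_fit_duration(
    X, y, max_classifier_time_budget=600, sample_factor=2)

Example #32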
 def test_default_configuration_multilabel(self):
     for i in range(2):
         dataset_properties = {'multilabel': True}
         classifier = SimpleClassificationPipeline(
             dataset_properties=dataset_properties)
         cs = classifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris',
                                                        make_multilabel=True)
         classifier.set_hyperparameters(default)
         classifier = classifier.fit(X_train, Y_train)
         predictions = classifier.predict(X_test)
         self.assertAlmostEqual(0.94,
                                sklearn.metrics.accuracy_score(predictions,
                                                               Y_test))
         scores = classifier.predict_proba(X_test)
Example #33
    def test_pipeline_clonability(self):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline()
        auto = auto.fit(X_train, Y_train)
        auto_clone = clone(auto)
        auto_clone_params = auto_clone.get_params()

        # Make sure all keys are copied properly
        for k, v in auto.get_params().items():
            self.assertIn(k, auto_clone_params)

        # Make sure the params getter of the estimator is honored
        klass = auto.__class__
        new_object_params = auto.get_params(deep=False)
        for name, param in new_object_params.items():
            new_object_params[name] = clone(param, safe=False)
        new_object = klass(**new_object_params)
        params_set = new_object.get_params(deep=False)

        for name in new_object_params:
            param1 = new_object_params[name]
            param2 = params_set[name]
            self.assertEqual(param1, param2)
Example #34
    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            categorical = [True, True, True, False, False, True, True, True,
                           False, True, True, True, True, True, True, True,
                           True, True, True, True, True, True, True, True, True,
                           True, True, True, True, True, True, True, False,
                           False, False, True, True, True]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(os.path.join(this_directory, "components",
                                        "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = \
                sklearn.cross_validation.train_test_split(X, y)

            cls = SimpleClassificationPipeline(config, random_state=1,)
            try:
                cls.fit(X_train, Y_train,
                        init_params={'one_hot_encoding:categorical_features': categorical})
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e

    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        print(configurations_space)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'classifier:passive_aggressive:n_iter': 5,
                            'classifier:sgd:n_iter': 5,
                            'classifier:adaboost:n_estimators': 50,
                            'classifier:adaboost:max_depth': 1,
                            'feature_preprocessor:kernel_pca:n_components': 10,
                            'feature_preprocessor:kitchen_sinks:n_components': 50,
                            'classifier:proj_logit:max_epochs': 1,
                            'classifier:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'feature_preprocessor:truncatedSVD:target_dim': 10,
                            'feature_preprocessor:polynomial:degree': 2,
                            'classifier:lda:n_components': 10,
                            'feature_preprocessor:nystroem_sampler:n_components': 50,
                            'feature_preprocessor:feature_agglomeration:n_clusters': 2,
                            'classifier:gradient_boosting:max_leaf_nodes': 64}

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            print(config)

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(random_state=1,
                                               dataset_properties=dataset_properties,
                                               init_params=init_params_,)
            cls.set_hyperparameters(config, init_params=init_params_)

            # First make sure that for this configuration, setting the
            # parameters does not mistakenly mark the estimator as fitted
            for name, step in cls.named_steps.items():
                with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
                                            "instance is not fitted yet"):
                    check_is_fitted(step)

            try:
                cls.fit(X_train, Y_train)

                # After fit, all components should be tagged as fitted
                # by sklearn. check_is_fitted raises an exception if that
                # is not the case.
                try:
                    for name, step in cls.named_steps.items():
                        check_is_fitted(step)
                except sklearn.exceptions.NotFittedError:
                    self.fail("config={} raised NotFittedError unexpectedly!".format(
                        config
                    ))

                cls.predict(X_test.copy())
                cls.predict_proba(X_test)
            except MemoryError:
                continue
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                     'values.' in e.args[0]:
                    continue
                elif 'Internal work array size computation failed' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "invalid value encountered in multiply" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                elif "invalid value encountered in multiply" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
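
A short, self-contained sketch of the check_is_fitted behaviour asserted above (single-argument form, available in scikit-learn >= 0.22): it raises NotFittedError before fit and returns silently afterwards.

    from sklearn.exceptions import NotFittedError
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils.validation import check_is_fitted

    clf = DecisionTreeClassifier()
    try:
        check_is_fitted(clf)      # not fitted yet -> raises
    except NotFittedError:
        pass
    clf.fit([[0], [1]], [0, 1])
    check_is_fitted(clf)          # fitted -> no exception
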
    def test_multilabel(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        dataset_properties = {"multilabel": True}
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties=dataset_properties)

        print(cs)
        cs.seed(5)

        for i in range(10):
            X, Y = sklearn.datasets.make_multilabel_classification(
                n_samples=150,
                n_features=20,
                n_classes=5,
                n_labels=2,
                length=50,
                allow_unlabeled=True,
                sparse=False,
                return_indicator=True,
                return_distributions=False,
                random_state=1,
            )
            X_train = X[:100, :]
            Y_train = Y[:100, :]
            X_test = X[101:, :]
            Y_test = Y[101:, :]

            config = cs.sample_configuration()

            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                for proba in predicted_probabilities:
                    self.assertIsInstance(proba, np.ndarray)
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError:
                continue
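
For context, a hedged sketch of the multilabel target format used above: make_multilabel_classification returns an indicator matrix Y with one 0/1 column per class, which is what the predict_proba output is iterated over.

    import sklearn.datasets

    X, Y = sklearn.datasets.make_multilabel_classification(
        n_samples=150, n_features=20, n_classes=5, random_state=1)
    assert Y.shape == (150, 5)       # indicator matrix: one column per class
    assert set(Y.ravel()) <= {0, 1}  # entries are membership flags
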
Example #37
0
    def _test_configurations(self,
                             configurations_space,
                             make_sparse=False,
                             data=None,
                             init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {
                'classifier:passive_aggressive:n_iter': 5,
                'classifier:sgd:n_iter': 5,
                'classifier:adaboost:n_estimators': 50,
                'classifier:adaboost:max_depth': 1,
                'preprocessor:kernel_pca:n_components': 10,
                'preprocessor:kitchen_sinks:n_components': 50,
                'classifier:proj_logit:max_epochs': 1,
                'classifier:libsvm_svc:degree': 2,
                'regressor:libsvm_svr:degree': 2,
                'preprocessor:truncatedSVD:target_dim': 10,
                'preprocessor:polynomial:degree': 2,
                'classifier:lda:n_components': 10,
                'preprocessor:nystroem_sampler:n_components': 50,
                'preprocessor:feature_agglomeration:n_clusters': 2
            }

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(
                random_state=1,
                dataset_properties=dataset_properties,
                init_params=init_params_)
            cls.set_hyperparameters(config)
            try:
                cls.fit(
                    X_train,
                    Y_train,
                )
                predictions = cls.predict(X_test)
            except MemoryError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                        'values.' in e.args[0]:
                    continue
                elif 'which is larger than the original space with n_features='\
                        in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
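
The restriction loop above recurs throughout these tests; a hedged helper capturing the pattern (config is assumed to behave like a ConfigSpace Configuration, including the private _values dict the tests poke into):

    def restrict_config(config, restrictions):
        # Cap expensive hyperparameters so sampled configs stay cheap on CI.
        for key, value in restrictions.items():
            if key in config and config[key] is not None:
                config._values[key] = value
        return config
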
    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.709, 0.658),
                 ('decision_tree', DecisionTree, 0.683, 0.701),
                 ('extra_trees', ExtraTreesClassifier, 0.812, 0.8),
                 ('gradient_boosting', GradientBoostingClassifier,
                    0.800, 0.760),
                 ('random_forest', RandomForest, 0.846, 0.792),
                 ('libsvm_svc', LibSVM_SVC, 0.571, 0.658),
                 ('liblinear_svc', LibLinear_SVC, 0.685, 0.699),
                 ('sgd', SGD, 0.65384615384615385, 0.38795986622073581)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.7142857142857143,
                    0.72180451127819545),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.5934065934065933, 0.71111111111111114)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)

                # pre_transform and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:700]
                Y_train = data_[1][:700]
                X_test = data_[0][700:]
                Y_test = data_[1][700:]

                cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                    include={'classifier': ['sgd'], 'preprocessor': [name]})
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(default, random_state=1)
                Xt, fit_params = classifier.pre_transform(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(acc,
                                       sklearn.metrics.f1_score(
                                           predictions, Y_test),
                                       places=3)
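
Conceptually, the 'weighting' strategy upweights the minority class. A hedged illustration using the sklearn-style 'balanced' heuristic (not necessarily auto-sklearn's exact implementation):

    import numpy as np

    y = np.array([0] * 8 + [1] * 2)
    classes, counts = np.unique(y, return_counts=True)
    weights = len(y) / (len(classes) * counts)
    # weights -> array([0.625, 2.5]): the rare class counts four times as much
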
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
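
A small sketch of what make_sparse=True is assumed to imply: the same digits features delivered as a scipy.sparse matrix, which is why the search space is built with dataset_properties={'sparse': True}.

    import scipy.sparse
    import sklearn.datasets

    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_sparse = scipy.sparse.csr_matrix(X)   # only sparse-capable components apply
    assert scipy.sparse.issparse(X_sparse)
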
    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if (
                "classifier:passive_aggressive:n_iter" in config
                and config["classifier:passive_aggressive:n_iter"] is not None
            ):
                config._values["classifier:passive_aggressive:n_iter"] = 5
            if "classifier:sgd:n_iter" in config and config["classifier:sgd:n_iter"] is not None:
                config._values["classifier:sgd:n_iter"] = 5
            if "classifier:adaboost:n_estimators" in config and config["classifier:adaboost:n_estimators"] is not None:
                config._values["classifier:adaboost:n_estimators"] = 50
            if "classifier:adaboost:max_depth" in config and config["classifier:adaboost:max_depth"] is not None:
                config._values["classifier:adaboost:max_depth"] = 1

            print(config)
            categorical = [
                True, True, True, False, False, True, True, True, False, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, False, False, False, True, True, True,
            ]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(os.path.join(this_directory, "components", "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(X, y)

            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train, init_params={"one_hot_encoding:categorical_features": categorical})
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif "Bug in scikit-learn" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e

    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, init_params=None,
                             dataset_properties=None):
        # Use a limit of ~3GiB
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        print(configurations_space)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'classifier:passive_aggressive:n_iter': 5,
                            'classifier:sgd:n_iter': 5,
                            'classifier:adaboost:n_estimators': 50,
                            'classifier:adaboost:max_depth': 1,
                            'preprocessor:kernel_pca:n_components': 10,
                            'preprocessor:kitchen_sinks:n_components': 50,
                            'classifier:proj_logit:max_epochs': 1,
                            'classifier:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'preprocessor:truncatedSVD:target_dim': 10,
                            'preprocessor:polynomial:degree': 2,
                            'classifier:lda:n_components': 10,
                            'preprocessor:nystroem_sampler:n_components': 50,
                            'preprocessor:feature_agglomeration:n_clusters': 2,
                            'classifier:gradient_boosting:max_depth': 2,
                            'classifier:gradient_boosting:n_estimators': 50}

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            print(config)

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse, add_NaNs=True)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            init_params_ = copy.deepcopy(init_params)
            cls = SimpleClassificationPipeline(random_state=1,
                                               dataset_properties=dataset_properties,
                                               init_params=init_params_,)
            cls.set_hyperparameters(config, init_params=init_params_)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test.copy())
                probabilities = cls.predict_proba(X_test)
            except MemoryError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                     'values.' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(traceback.format_exc())
                    print(config)
                    raise e

    def test_configurations_categorical_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5
            if 'classifier:adaboost:n_estimators' in config and \
                    config['classifier:adaboost:n_estimators'] is not None:
                config._values['classifier:adaboost:n_estimators'] = 50
            if 'classifier:adaboost:max_depth' in config and \
                    config['classifier:adaboost:max_depth'] is not None:
                config._values['classifier:adaboost:max_depth'] = 1

            print(config)
            categorical = [
                True, True, True, False, False, True, True, True, False, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, True, True, True, True, True, True, True, True,
                True, True, False, False, False, True, True, True
            ]
            this_directory = os.path.dirname(__file__)
            X = np.loadtxt(
                os.path.join(this_directory, "components",
                             "data_preprocessing", "dataset.pkl"))
            y = X[:, -1].copy()
            X = X[:, :-1]
            X_train, X_test, Y_train, Y_test = \
                sklearn.cross_validation.train_test_split(X, y)

            cls = SimpleClassificationPipeline(
                config,
                random_state=1,
            )
            try:
                cls.fit(X_train,
                        Y_train,
                        init_params={
                            'one_hot_encoding:categorical_features':
                            categorical
                        })
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
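
A hedged sketch of the init_params routing used above: keys follow the '<component>:<parameter>' scheme, and the boolean list flags which input columns the one-hot encoder should treat as categorical. The three-column layout below is hypothetical.

    init_params = {
        # categorical, numerical, categorical (hypothetical dataset)
        'one_hot_encoding:categorical_features': [True, False, True],
    }
    # Passed either to fit(..., init_params=init_params) or to the pipeline
    # constructor, as the surrounding tests demonstrate.
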
    def test_multilabel(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        dataset_properties = {'multilabel': True}
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties=dataset_properties)

        print(cs)
        cs.seed(5)

        for i in range(10):
            X, Y = sklearn.datasets.make_multilabel_classification(
                n_samples=150,
                n_features=20,
                n_classes=5,
                n_labels=2,
                length=50,
                allow_unlabeled=True,
                sparse=False,
                return_indicator=True,
                return_distributions=False,
                random_state=1)
            X_train = X[:100, :]
            Y_train = Y[:100, :]
            X_test = X[101:, :]
            Y_test = Y[101:, :]

            config = cs.sample_configuration()

            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5
            if 'classifier:adaboost:n_estimators' in config and \
                    config['classifier:adaboost:n_estimators'] is not None:
                config._values['classifier:adaboost:n_estimators'] = 50
            if 'classifier:adaboost:max_depth' in config and \
                    config['classifier:adaboost:max_depth'] is not None:
                config._values['classifier:adaboost:max_depth'] = 1

            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                for proba in predicted_probabilities:
                    self.assertIsInstance(proba, np.ndarray)
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError:
                continue
Example #44
0
    def _test_configurations(self,
                             configurations_space,
                             make_sparse=False,
                             data=None,
                             init_params=None):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        print(configurations_space)
        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {
                'classifier:passive_aggressive:n_iter': 5,
                'classifier:sgd:n_iter': 5,
                'classifier:adaboost:n_estimators': 50,
                'classifier:adaboost:max_depth': 1,
                'preprocessor:kernel_pca:n_components': 10,
                'preprocessor:kitchen_sinks:n_components': 50,
                'preprocessor:gem:N': 5,
                'classifier:proj_logit:max_epochs': 1,
                'classifier:libsvm_svc:degree': 2,
                'regressor:libsvm_svr:degree': 2
            }

            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            print(config)

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='digits', make_sparse=make_sparse)
            else:
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                init_params_ = copy.deepcopy(init_params)
                cls.fit(X_train, Y_train, init_params=init_params_)
                predictions = cls.predict(X_test)
            except MemoryError:
                continue
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
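
Note that init_params is deep-copied on every iteration because fit() may consume or mutate it; a minimal sketch of the pattern:

    import copy

    base_init_params = {'one_hot_encoding:categorical_features': [True, False]}
    for _ in range(10):
        init_params_ = copy.deepcopy(base_init_params)  # fresh copy per fit
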
Example #45
0
    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=200, n_features=10, n_redundant=2, n_informative=2,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting in \
                [('adaboost', AdaboostClassifier, 0.810, 0.735),
                 ('decision_tree', DecisionTree, 0.780, 0.643),
                 ('extra_trees', ExtraTreesClassifier, 0.75, 0.800),
                 ('gradient_boosting', GradientBoostingClassifier,
                  0.789, 0.762),
                 ('random_forest', RandomForest, 0.75, 0.821),
                 ('libsvm_svc', LibSVM_SVC, 0.769, 0.72),
                 ('liblinear_svc', LibLinear_SVC, 0.762, 0.735),
                 ('sgd', SGD, 0.704, 0.667)
                ]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': [name],
                           'preprocessor': ['no_preprocessing']}
                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                classifier.set_hyperparameters(configuration=default)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.691, 0.692),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.692, 0.590)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': ['sgd'], 'preprocessor': [name]}

                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier.set_hyperparameters(default)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)

    def test_weighting_effect(self):
        data = sklearn.datasets.make_classification(
            n_samples=200, n_features=10, n_redundant=2, n_informative=2,
            n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
            random_state=1)

        for name, clf, acc_no_weighting, acc_weighting, places in \
                [('adaboost', AdaboostClassifier, 0.810, 0.735, 3),
                 ('decision_tree', DecisionTree, 0.780, 0.643, 3),
                 ('extra_trees', ExtraTreesClassifier, 0.780, 0.8, 3),
                 ('gradient_boosting', GradientBoostingClassifier,
                  0.737, 0.684, 3),
                 ('random_forest', RandomForest, 0.780, 0.789, 3),
                 ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3),
                 ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3),
                 ('passive_aggressive', PassiveAggressive, 0.642, 0.449, 3),
                 ('sgd', SGD, 0.818, 0.575, 2)
                ]:
            for strategy, acc in [
                ('none', acc_no_weighting),
                ('weighting', acc_weighting)
            ]:
                # Fit
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': [name],
                           'preprocessor': ['no_preprocessing']}
                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=places, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                classifier.set_hyperparameters(configuration=default)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=places)

        for name, pre, acc_no_weighting, acc_weighting in \
                [('extra_trees_preproc_for_classification',
                    ExtraTreesPreprocessorClassification, 0.810, 0.563),
                 ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
                    0.837, 0.567)]:
            for strategy, acc in [('none', acc_no_weighting),
                                  ('weighting', acc_weighting)]:
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                include = {'classifier': ['sgd'], 'preprocessor': [name]}

                classifier = SimpleClassificationPipeline(
                    random_state=1, include=include)
                cs = classifier.get_hyperparameter_search_space()
                default = cs.get_default_configuration()
                default._values['balancing:strategy'] = strategy
                classifier.set_hyperparameters(default)
                predictor = classifier.fit(X_train, Y_train)
                predictions = predictor.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3, msg=(name, strategy))

                # fit_transformer and fit_estimator
                data_ = copy.copy(data)
                X_train = data_[0][:100]
                Y_train = data_[1][:100]
                X_test = data_[0][100:]
                Y_test = data_[1][100:]

                default._values['balancing:strategy'] = strategy
                classifier = SimpleClassificationPipeline(
                    default, random_state=1, include=include)
                Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
                classifier.fit_estimator(Xt, Y_train, **fit_params)
                predictions = classifier.predict(X_test)
                self.assertAlmostEqual(
                    sklearn.metrics.f1_score(predictions, Y_test), acc,
                    places=3)
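
Finally, a hedged sketch of the split-fit contract these weighting tests exercise (method names as used above, assumed rather than verified against the library):

    def fit_in_two_steps(classifier, X_train, Y_train):
        # fit_transformer returns the transformed data plus fit parameters
        # (e.g. sample weights produced by the balancing step) ...
        Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
        # ... which must be forwarded to the final estimator, mirroring what
        # a plain fit(X_train, Y_train) would do internally.
        classifier.fit_estimator(Xt, Y_train, **fit_params)
        return classifier
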