Example 1
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
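The test above counts estimator calls by wrapping the final pipeline step in a Mock with wraps=, so the real object still does the work while every call is recorded. A minimal, self-contained sketch of the same counting trick on a plain scikit-learn estimator (the DecisionTreeClassifier and digits data here are only illustrative, not the pipeline used in the test):

import math
import unittest.mock as mock

from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier

X, y = load_digits(return_X_y=True)
clf = DecisionTreeClassifier(random_state=1).fit(X, y)

# Calls are forwarded to the wrapped estimator, but the mock records them.
spy = mock.Mock(wraps=clf)

batch_size = 20
for start in range(0, X.shape[0], batch_size):
    spy.predict_proba(X[start:start + batch_size])

# One predict_proba call per batch of at most batch_size rows.
assert spy.predict_proba.call_count == math.ceil(X.shape[0] / batch_size)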
Example 2
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'classifier': ['decision_tree']})
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array(
            list([(list([1 if i != y else 0 for i in range(10)]))
                  for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_predict_proba_batched(self):
        # Multiclass
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')

        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array(list([(list([1 if i != y else 0 for i in range(10)]))
                                 for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)
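The multilabel targets in these tests are built with a nested list comprehension that switches on every label except the true class. Assuming integer labels in range(10), an equivalent NumPy construction produces exactly the same indicator matrix:

import numpy as np

Y_train = np.array([3, 0, 7])  # illustrative integer labels in range(10)

# 1 everywhere except at the position of the true class, row by row; identical to
# np.array([[1 if i != y else 0 for i in range(10)] for y in Y_train]).
Y_multi = 1 - np.eye(10, dtype=int)[Y_train]

assert Y_multi.shape == (3, 10)
assert (Y_multi.sum(axis=1) == 9).all()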
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"
            })

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array(
            list([(list([1 if i != y else 0 for i in range(10)]))
                  for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, (1647, 10))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example 5
    def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
            self):
        cs = SimpleClassificationPipeline(
            include={'preprocessor': ['densifier']}, dataset_properties={'sparse': True}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'qda')

        cs = SimpleClassificationPipeline(include={'preprocessor': ['nystroem_sampler']}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__').default, 'sgd')
Example 6
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError:
                continue
def get_models_performance_by_data(input):
    X, y = input
    # The probas file is unused here, and data_set_idx is not defined in this
    # scope, so the original load is left disabled:
    # probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    model = random_model()
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    time_start = time.time()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
    time_end = time.time()
    duration = time_end - time_start
    models_performance = {
        "train_accuracy_score": np.mean(train_accuracy_score),
        "test_accuracy_score": np.mean(test_accuracy_score),
        "train_log_loss": np.mean(train_log_loss),
        "test_log_loss": np.mean(test_log_loss),
        "duration": duration / 5
    }
    return models_performance
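Note that log_loss is defined on predicted class probabilities; passing the hard labels from predict, as this helper does, scores every prediction as fully confident and can fail outright for multiclass targets. A hedged variant using plain scikit-learn pieces (LogisticRegression and the digits data are illustrative only) scores predict_proba output instead:

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score class probabilities, which is what log_loss expects.
print(log_loss(y_test, clf.predict_proba(X_test)))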
def get_performance_of_encoded_model(data_set, encoded_model, verbose=False):
    """
    Get model performance array(4 * 1) from encoded model vector(17 * 1)
    data_set : (X, y) input dataset to get performance
    encoded_model : encoded model choice vector (17 * 1)
    verbose : if True, will log model choice dictionary and model performance array
    return : model performance vector(4 * 1)
    """
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []
    X, y = data_set
    #kf = KFold(n_splits=5, random_state=1, shuffle=True)
    model = decode_model(encoded_model)
    if verbose:
        print('Model choice: {0}'.format(model))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        
    p = SimpleClassificationPipeline(config=model)
    p.fit(X_train, y_train)
    #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
    #print(scores)
    y_train_pred = p.predict(X_train)
    y_test_pred = p.predict(X_test)
    train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
    test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
    train_log_loss.append(log_loss(y_train, y_train_pred))
    test_log_loss.append(log_loss(y_test, y_test_pred))
    model_performance = np.array([np.mean(train_accuracy_score), np.mean(test_accuracy_score), np.mean(train_log_loss), np.mean(test_log_loss)])
    if verbose:
        print('Model Performance: {0}'.format(model_performance))
    return model_performance
Example 9
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include={'classifier': ['decision_tree']},
            dataset_properties={'sparse': True})
        config = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array(
            list([(list([1 if i != y else 0 for i in range(10)]))
                  for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_fit_instantiates_component(self):
        """Make sure that if a preprocessor is added, it's fit
        method is called"""
        preprocessing_components.add_preprocessor(CrashPreprocessor)

        # We reduce the search space because forbidden clauses prevent us from
        # instantiating the user-defined preprocessor manually
        cls = SimpleClassificationPipeline(include={'classifier': ['random_forest']})
        cs = cls.get_hyperparameter_search_space()
        self.assertIn('CrashPreprocessor', str(cs))
        config = cs.sample_configuration()
        try:
            config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
        except Exception as e:
            # In case of failure, clean up the added component and print enough
            # information to debug the failure
            del preprocessing_components._addons.components['CrashPreprocessor']
            self.fail("cs={} config={} Exception={}".format(cs, config, e))
        cls.set_hyperparameters(config)
        with self.assertRaisesRegex(
            ValueError,
            "Make sure fit is called"
        ):
            cls.fit(
                X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
                y=np.array([1, 0, 1, 1])
            )
        del preprocessing_components._addons.components['CrashPreprocessor']
    def test_configurations_signed_data(self):
        dataset_properties = {'signed': True}
        cs = SimpleClassificationPipeline(dataset_properties=dataset_properties)\
            .get_hyperparameter_search_space()

        self._test_configurations(configurations_space=cs,
                                  dataset_properties=dataset_properties)
    def test_get_hyperparameter_search_space(self):
        cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
        self.assertIsInstance(cs, ConfigurationSpace)
        conditions = cs.get_conditions()
        forbiddens = cs.get_forbiddens()

        self.assertEqual(
            len(
                cs.get_hyperparameter(
                    'data_preprocessing:numerical_transformer:rescaling:__choice__'
                ).choices), 7)
        self.assertEqual(
            len(cs.get_hyperparameter('classifier:__choice__').choices), 16)
        self.assertEqual(
            len(
                cs.get_hyperparameter(
                    'feature_preprocessor:__choice__').choices), 13)

        hyperparameters = cs.get_hyperparameters()
        self.assertEqual(167, len(hyperparameters))

        # for hp in sorted([str(h) for h in hyperparameters]):
        #    print hp

        # The four components which are always active are classifier,
        # feature preprocessor, balancing and data preprocessing pipeline.
        self.assertEqual(len(hyperparameters) - 7, len(conditions))

        self.assertEqual(len(forbiddens), 53)
Example 13
def _get_classification_configuration_space(info, include, exclude):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return SimpleClassificationPipeline(
        dataset_properties=dataset_properties,
        include=include, exclude=exclude).\
        get_hyperparameter_search_space()
Example 14
    def test_configurations_categorical_data(self):
        cs = SimpleClassificationPipeline(dataset_properties={'sparse': True}).\
            get_hyperparameter_search_space()

        categorical = [
            True, True, True, False, False, True, True, True, False, True,
            True, True, True, True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True, True, True, True,
            False, False, False, True, True, True
        ]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(
            os.path.join(this_directory, "components", "data_preprocessing",
                         "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]

        # In order to usefully test the neural networks
        _, y = np.unique(y, return_inverse=True)

        X_train, X_test, Y_train, Y_test = \
            sklearn.model_selection.train_test_split(X, y)
        data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        }

        init_params = {'one_hot_encoding:categorical_features': categorical}

        self._test_configurations(configurations_space=cs,
                                  make_sparse=True,
                                  data=data,
                                  init_params=init_params)
    def test_add_classifier(self):
        self.assertEqual(len(classification_components._addons.components), 0)
        classification_components.add_classifier(DummyClassifier)
        self.assertEqual(len(classification_components._addons.components), 1)
        cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
        self.assertIn('DummyClassifier', str(cs))
        del classification_components._addons.components['DummyClassifier']
    def test_configurations_categorical_data(self):
        cs = SimpleClassificationPipeline(
            dataset_properties={'sparse': False},
            include={
                'feature_preprocessor': ['no_preprocessing'],
                'classifier': ['sgd', 'adaboost']
            }
        ).get_hyperparameter_search_space()

        categorical = [True, True, True, False, False, True, True, True,
                       False, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, False,
                       False, False, True, True, True]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(os.path.join(this_directory, "components",
                                    "data_preprocessing", "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.model_selection.train_test_split(X, y)
        data = {'X_train': X_train, 'Y_train': Y_train,
                'X_test': X_test, 'Y_test': Y_test}

        init_params = {
            'data_preprocessing:categorical_features':
                categorical
        }

        self._test_configurations(configurations_space=cs, make_sparse=True,
                                  data=data, init_params=init_params)
Example 17
def _get_classification_configuration_space(
        info: Dict[str, Any], include: Optional[Dict[str, List[str]]],
        exclude: Optional[Dict[str, List[str]]]) -> ConfigurationSpace:
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return SimpleClassificationPipeline(
        dataset_properties=dataset_properties,
        include=include, exclude=exclude).\
        get_hyperparameter_search_space()
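For context, a hedged usage sketch of this helper. Only the 'task' and 'is_sparse' keys of info are read, and the task constants are assumed to come from autosklearn.constants, as in the rest of the code base:

from autosklearn.constants import MULTICLASS_CLASSIFICATION

# Dense multiclass problem, no restrictions on the component choices.
info = {'task': MULTICLASS_CLASSIFICATION, 'is_sparse': 0}
cs = _get_classification_configuration_space(info, include=None, exclude=None)
print(cs.get_hyperparameter('classifier:__choice__'))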
    def test_add_preprocessor(self):
        self.assertEqual(len(preprocessing_components._addons.components), 0)
        preprocessing_components.add_preprocessor(DummyPreprocessor)
        self.assertEqual(len(preprocessing_components._addons.components), 1)
        cs = SimpleClassificationPipeline().get_hyperparameter_search_space()
        self.assertIn('DummyPreprocessor', str(cs))
        del preprocessing_components._addons.components['DummyPreprocessor']
    def test_predict_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array(
            list([(list([1 if i != y else 0 for i in range(10)]))
                  for y in Y_train]))
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
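The call-count assertions follow directly from the batch arithmetic: 1647 test rows split into batches of 20 mean the wrapped predict is invoked ceil(1647 / 20) = 83 times. Spelled out:

import math

n_rows, batch_size = 1647, 20
assert math.ceil(n_rows / batch_size) == 83  # matches cls_predict.predict.call_count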
    def test_multilabel(self):
        cache = Memory(location=tempfile.gettempdir())
        cached_func = cache.cache(
            sklearn.datasets.make_multilabel_classification
        )
        X, Y = cached_func(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1
        )
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, ]

        data = {'X_train': X_train, 'Y_train': Y_train,
                'X_test': X_test, 'Y_test': Y_test}

        dataset_properties = {'multilabel': True}
        cs = SimpleClassificationPipeline(dataset_properties=dataset_properties).\
            get_hyperparameter_search_space()
        self._test_configurations(configurations_space=cs, data=data)
    def test_default_configuration(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            auto = SimpleClassificationPipeline()
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(X_test)
            self.assertAlmostEqual(
                0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
            auto.predict_proba(X_test)
def get_models_performance(reproduce_num, data_set_idx):
    '''
    reproduce_num: the number of model choices for the dataset to reproduce
    data_set_idx: generated data set index, will load tried models for the dataset json file
    return: reproduced models performance json file
    '''
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    tried_models_filename = "./log/classifier_log" + str(
        data_set_idx) + "/tried_models_for_dataset" + str(
            data_set_idx) + ".json"
    models_performance = {}
    # duration = get_training_duration(data_set_idx)
    with open(tried_models_filename) as fp:
        models = json.load(fp)
        reproduce_num_act = min(len(models), reproduce_num)
        for i in range(1, reproduce_num_act + 1):
            model = models[str(i)]
            #print(model)
            train_accuracy_score = []
            test_accuracy_score = []
            train_log_loss = []
            test_log_loss = []
            #kf = KFold(n_splits=5, random_state=1, shuffle=True)
            time_start = time.time()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, shuffle=True)

            p = SimpleClassificationPipeline(config=model)
            p.fit(X_train, y_train)
            #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
            #print(scores)
            y_train_pred = p.predict(X_train)
            y_test_pred = p.predict(X_test)
            train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
            test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
            train_log_loss.append(log_loss(y_train, y_train_pred))
            test_log_loss.append(log_loss(y_test, y_test_pred))
            time_end = time.time()
            duration = time_end - time_start
            models_performance[i] = {
                "train_accuracy_score": np.mean(train_accuracy_score),
                "test_accuracy_score": np.mean(test_accuracy_score),
                "train_log_loss": np.mean(train_log_loss),
                "test_log_loss": np.mean(test_log_loss),
                "duration": duration
            }
            #if i in duration:
            #    models_performance[i]["duration"] = duration[i]
    reproduce_performance_json_filename = "./log/classifier_log" + str(
        data_set_idx) + "/reproduce_models_performance" + str(
            data_set_idx) + ".json"
    with open(reproduce_performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
    def test_default_configuration_iterative_fit(self):
        classifier = SimpleClassificationPipeline(
            include={'classifier': ['random_forest'],
                     'feature_preprocessor': ['no_preprocessing']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        classifier.fit_transformer(X_train, Y_train)
        for i in range(1, 11):
            classifier.iterative_fit(X_train, Y_train)
            self.assertEqual(
                classifier.steps[-1][-1].choice.estimator.n_estimators, i)
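The assertion relies on iterative_fit growing the random forest by one tree per call. As a rough analogue in plain scikit-learn (this is not auto-sklearn's implementation, just the underlying warm_start mechanism), the same effect looks like:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)

clf = RandomForestClassifier(n_estimators=1, warm_start=True, random_state=1)
for i in range(1, 11):
    clf.n_estimators = i  # grow the forest by one tree per iteration
    clf.fit(X, y)
    assert len(clf.estimators_) == i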
Example 24
    def test_default_configuration(self):
        for i in range(2):
            cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            auto = SimpleClassificationPipeline(default)
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(X_test)
            self.assertAlmostEqual(
                0.9599999999999995,
                sklearn.metrics.accuracy_score(predictions, Y_test))
            scores = auto.predict_proba(X_test)
Example 25
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = SimpleClassificationPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                        "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example 26
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": "True",
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                "rescaling:__choice__": "min/max"
            })
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
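get_dataset(..., make_sparse=True) is a helper from the auto-sklearn test suite. A rough stand-in built only from public APIs might look like the sketch below; zeroing a random subset of entries before converting to CSR is an assumption about what make_sparse does, not a faithful copy of it:

import numpy as np
import scipy.sparse
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=1)

# Zero out roughly half of the entries, then store the result as CSR matrices.
rng = np.random.RandomState(1)
X_train = scipy.sparse.csr_matrix(X_train * (rng.rand(*X_train.shape) < 0.5))
X_test = scipy.sparse.csr_matrix(X_test * (rng.rand(*X_test.shape) < 0.5))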
Example 27
def max_estimators_fit_duration(X,
                                y,
                                max_classifier_time_budget,
                                logger,
                                sample_factor=1):
    lo = utl.get_logger(inspect.stack()[0][3])

    lo.info("Constructing preprocessor pipeline and transforming sample data")
    # we don't care about the data here but need to preprocess, otherwise the classifiers crash

    pipeline = SimpleClassificationPipeline(include={
        'imputation': ['most_frequent'],
        'rescaling': ['standardize']
    })
    default_cs = pipeline.get_hyperparameter_search_space(
    ).get_default_configuration()
    pipeline = pipeline.set_hyperparameters(default_cs)

    pipeline.fit(X, y)
    X_tr, dummy = pipeline.fit_transformer(X, y)

    lo.info("Running estimators on the sample")
    # going over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default 3 sec
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(target=time_single_estimator,
                                         name=clf_name,
                                         args=(clf_name, clf_class, X_tr, y,
                                               max_clf_time, logger))
            pr.start()
            processes.append(pr)
        for pr in processes:
            # Block for at most max_classifier_time_budget seconds, or until the
            # classifier fit process finishes; any process still running after
            # the budget is terminated below.
            pr.join(max_classifier_time_budget)
            if pr.is_alive():
                logger.info("Terminating " + pr.name +
                            " process due to timeout")
                pr.terminate()
        result_max_clf_time = max_clf_time.value

    lo.info("Test classifier fit completed")

    per_run_time_limit = int(sample_factor * result_max_clf_time)
    return min(per_run_time_limit, max_classifier_time_budget)
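The per-classifier timing above follows a standard multiprocessing pattern: start one process per candidate, join with a timeout, and terminate whatever is still alive. A self-contained sketch of just that pattern (the sleeping workers stand in for the classifier fits):

import multiprocessing
import time


def worker(seconds):
    time.sleep(seconds)


if __name__ == '__main__':
    budget = 2  # stands in for max_classifier_time_budget (seconds)
    processes = [multiprocessing.Process(target=worker, args=(s,), name='w%d' % s)
                 for s in (1, 5)]
    for pr in processes:
        pr.start()
    for pr in processes:
        pr.join(budget)       # block for at most `budget` seconds
        if pr.is_alive():     # still running after the budget: kill it
            print('Terminating', pr.name, 'due to timeout')
            pr.terminate()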
    def test_default_configuration_multilabel(self):
        for i in range(2):
            classifier = SimpleClassificationPipeline(
                random_state=1, dataset_properties={'multilabel': True})
            cs = classifier.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset='iris', make_multilabel=True)
            classifier.set_hyperparameters(default)
            classifier = classifier.fit(X_train, Y_train)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
            classifier.predict_proba(X_test)
Example 29
    def test_get_hyperparameter_search_space_include_exclude_models(self):
        cs = SimpleClassificationPipeline(include={'classifier': ['libsvm_svc']})\
            .get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('classifier:__choice__'),
            CategoricalHyperparameter('classifier:__choice__', ['libsvm_svc']))

        cs = SimpleClassificationPipeline(exclude={'classifier': ['libsvm_svc']}).\
            get_hyperparameter_search_space()
        self.assertNotIn('libsvm_svc', str(cs))

        cs = SimpleClassificationPipeline(
            include={'preprocessor': ['select_percentile_classification']}).\
            get_hyperparameter_search_space()
        self.assertEqual(
            cs.get_hyperparameter('preprocessor:__choice__'),
            CategoricalHyperparameter('preprocessor:__choice__',
                                      ['select_percentile_classification']))

        cs = SimpleClassificationPipeline(exclude={
            'preprocessor': ['select_percentile_classification']
        }).get_hyperparameter_search_space()
        self.assertNotIn('select_percentile_classification', str(cs))
def get_performance_of_range_encoded_models(data_set_idx, encoded_all_model_hyperparameters, json_model, verbose=False):
    """
    Get models performance (30 * 5) from encoded model choice matrix (30 * 38)
    """
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    models_performance = {}
    #get_performance_of_encoded_model([X, y], encoded_all_model_hyperparameters[0])
    for i in range(len(encoded_all_model_hyperparameters)):
        #model = models[str(i)]
        encoded_model = encoded_all_model_hyperparameters[i]
        model = decode_model(encoded_model)
        if verbose:
            print('Original json model: ', json_model[str(i+1)])
            print('Encoded model: ', encoded_model)
            print('Decoded model:', model)
            print("==========================================================")
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []
        #kf = KFold(n_splits=5, random_state=1)
        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)
        
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        #scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
        #print(scores)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start
        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration
        }

    performance_json_filename = "./log/classifier_log" + str(
        data_set_idx) + "/reproduce_models_performance" + str(
            data_set_idx) + ".json"
    with open(performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance