def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
    """Check which classifier is the default when the preprocessor choice is restricted.

    Expects ``qda`` for sparse data with only ``densifier`` included and
    ``sgd`` when only ``nystroem_sampler`` is included.
    """
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Sparse data, densifier as the only preprocessor.
    densifier_space = build_space(
        include={"preprocessor": ["densifier"]},
        dataset_properties={"sparse": True},
    )
    densifier_choice = densifier_space.get_hyperparameter("classifier:__choice__")
    self.assertEqual(densifier_choice.default, "qda")

    # Nystroem sampler as the only preprocessor.
    nystroem_space = build_space(include={"preprocessor": ["nystroem_sampler"]})
    nystroem_choice = nystroem_space.get_hyperparameter("classifier:__choice__")
    self.assertEqual(nystroem_choice.default, "sgd")
def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(self):
    """Check which classifier is the default when the preprocessor choice is restricted.

    Expects ``qda`` for sparse data with only ``densifier`` included and
    ``sgd`` when only ``nystroem_sampler`` is included.
    """
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Sparse data, densifier as the only preprocessor.
    densifier_space = build_space(
        include={'preprocessor': ['densifier']},
        dataset_properties={'sparse': True})
    densifier_default = densifier_space.get_hyperparameter('classifier:__choice__').default
    self.assertEqual(densifier_default, 'qda')

    # Nystroem sampler as the only preprocessor.
    nystroem_space = build_space(include={'preprocessor': ['nystroem_sampler']})
    nystroem_default = nystroem_space.get_hyperparameter('classifier:__choice__').default
    self.assertEqual(nystroem_default, 'sgd')
def test_fit_instantiates_component(self):
    """Make sure that if a preprocessor is added, its fit method is called.

    ``CrashPreprocessor`` is registered as an add-on, selected in a sampled
    configuration, and fitting is expected to raise a ``ValueError`` matching
    "Make sure fit is called".  The add-on is removed in a ``finally`` block
    so that a failing assertion cannot leave it registered for the tests
    that run afterwards.
    """
    preprocessing_components.add_preprocessor(CrashPreprocessor)
    try:
        # We reduce the search space as forbidden clauses prevent to instantiate
        # the user defined preprocessor manually
        cls = SimpleClassificationPipeline(include={'classifier': ['random_forest']})
        cs = cls.get_hyperparameter_search_space()
        self.assertIn('CrashPreprocessor', str(cs))
        config = cs.sample_configuration()
        try:
            config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
        except Exception as e:
            # Report enough information to debug the failure; the outer
            # ``finally`` takes care of removing the add-on.
            self.fail("cs={} config={} Exception={}".format(cs, config, e))
        cls.set_hyperparameters(config)
        with self.assertRaisesRegex(ValueError, "Make sure fit is called"):
            cls.fit(
                X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
                y=np.array([1, 0, 1, 1]),
            )
    finally:
        # Always unregister, whether the test passed or failed.
        del preprocessing_components._addons.components['CrashPreprocessor']
def test_predict_batched_sparse(self):
    """Compare batched and unbatched ``predict`` on sparse digits data.

    Uses the default configuration of a sparse search space restricted to
    the ``decision_tree`` classifier, first with multiclass labels and then
    with multilabel indicator targets.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']},
        dataset_properties={'sparse': True})
    pipeline = SimpleClassificationPipeline(search_space.get_default_configuration())

    def check_batched(encode_targets, expected_shape):
        """Fit, then compare one-shot predictions with ``batch_size=20`` predictions."""
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict(X_test.copy())
        # Wrap the fitted pipeline so calls to its ``predict`` can be counted.
        spy = unittest.mock.Mock(wraps=pipeline.pipeline_)
        pipeline.pipeline_ = spy
        batched = pipeline.predict(X_test, batch_size=20)
        self.assertEqual(expected_shape, batched.shape)
        self.assertEqual(83, spy.predict.call_count)
        assert_array_almost_equal(reference, batched)

    # Multiclass
    check_batched(lambda labels: labels, (1647, ))

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    check_batched(
        lambda labels: np.array(
            [[1 if i != y else 0 for i in range(10)] for y in labels]),
        (1647, 10))
def test_configurations_signed_data(self):
    """Fit and predict with ten sampled configurations of the signed-data search space.

    Failures whose message matches one of the tolerated texts below skip
    the sample, as does a ``MemoryError``; any other ``ValueError``,
    ``RuntimeWarning`` or ``UserWarning`` is printed with its configuration
    and re-raised.
    """
    # Use a limit of ~4GiB
    memory_limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))

    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(search_space)

    for _ in range(10):
        config = search_space.sample_configuration()
        config._populate_values()
        # Force n_iter to 5 wherever it is set (presumably to keep the run short).
        for n_iter_key in ('classifier:passive_aggressive:n_iter',
                           'classifier:sgd:n_iter'):
            if config[n_iter_key] is not None:
                config._values[n_iter_key] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            pipeline.fit(X_train, Y_train)
            X_test_copy = X_test.copy()
            predictions = pipeline.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            probabilities = pipeline.predict_proba(X_test_copy)
            self.assertIsInstance(probabilities, np.ndarray)
        except ValueError as e:
            if not any(text in e.args[0] for text in tolerated_value_errors):
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if not any(text in e.args[0] for text in tolerated_runtime_warnings):
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" not in e.args[0]:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            # Running out of memory is tolerated; move on to the next sample.
            pass
def test_configurations_categorical_data(self):
    """Run ``_test_configurations`` on a dataset with categorical features.

    The data is read with ``np.loadtxt`` from
    ``components/data_preprocessing/dataset.pkl`` next to this file; the
    last column is the target and the rest are the features.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})

    # Per-feature flags handed to the one-hot encoder.
    categorical = [True, True, True, False, False, True, True, True,
                   False, True, True, True, True, True, True, True,
                   True, True, True, True, True, True, True, True,
                   True, True, True, True, True, True, True, True,
                   False, False, False, True, True, True]

    dataset_path = os.path.join(os.path.dirname(__file__), "components",
                                "data_preprocessing", "dataset.pkl")
    table = np.loadtxt(dataset_path)
    y = table[:, -1].copy()
    X = table[:, :-1]

    # NOTE(review): sklearn.cross_validation is gone from newer scikit-learn
    # releases (superseded by sklearn.model_selection) - confirm the pinned
    # version before upgrading.
    X_train, X_test, Y_train, Y_test = \
        sklearn.cross_validation.train_test_split(X, y)

    self._test_configurations(
        configurations_space=search_space,
        make_sparse=True,
        data={'X_train': X_train, 'Y_train': Y_train,
              'X_test': X_test, 'Y_test': Y_test},
        init_params={'one_hot_encoding:categorical_features': categorical})
def test_predict_proba_batched(self):
    """Compare batched and unbatched ``predict_proba`` for the default configuration.

    The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = search_space.get_default_configuration()

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel: two target columns, (y, 26 - y); the result is a list
    # with one probability array per column.
    reference, batched, spy = run_batched(
        lambda labels: np.array([(y, 26 - y) for y in labels]))
    self.assertIsInstance(batched, list)
    self.assertEqual(2, len(batched))
    self.assertEqual((1647, 10), batched[0].shape)
    self.assertEqual((1647, 10), batched[1].shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_predict_batched(self):
    """Compare batched and unbatched ``predict`` for the default configuration.

    The fitted pipeline is wrapped in a mock so the number of ``predict``
    calls made with ``batch_size=20`` can be asserted, first for multiclass
    labels and then for multilabel indicator targets.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = search_space.get_default_configuration()
    pipeline = SimpleClassificationPipeline(default)

    def check_batched(encode_targets, expected_shape):
        """Fit, then compare one-shot predictions with ``batch_size=20`` predictions."""
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict(X_test.copy())
        # Wrap the fitted pipeline so calls to its ``predict`` can be counted.
        spy = mock.Mock(wraps=pipeline.pipeline_)
        pipeline.pipeline_ = spy
        batched = pipeline.predict(X_test, batch_size=20)
        self.assertEqual(expected_shape, batched.shape)
        self.assertEqual(83, spy.predict.call_count)
        assert_array_almost_equal(reference, batched)

    # Multiclass
    check_batched(lambda labels: labels, (1647,))

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    check_batched(
        lambda labels: np.array([[1 if i != y else 0 for i in range(10)] for y in labels]),
        (1647, 10),
    )
def test_predict_proba_batched(self):
    """Compare batched and unbatched ``predict_proba`` for the default configuration.

    The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = search_space.get_default_configuration()

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    reference, batched, spy = run_batched(
        lambda labels: np.array([[1 if i != y else 0 for i in range(10)] for y in labels])
    )
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(batched.shape, (1647, 10))
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_add_preprocessor(self):
    """Check that a registered add-on preprocessor shows up in the search space.

    The add-on registry must be empty beforehand.  ``DummyPreprocessor`` is
    removed in a ``finally`` block so that a failing assertion cannot leave
    it registered for the tests that run afterwards.
    """
    self.assertEqual(len(preprocessing_components._addons.components), 0)
    preprocessing_components.add_preprocessor(DummyPreprocessor)
    try:
        self.assertEqual(len(preprocessing_components._addons.components), 1)
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIn("DummyPreprocessor", str(cs))
    finally:
        # Always unregister, whether the assertions passed or failed.
        del preprocessing_components._addons.components["DummyPreprocessor"]
def test_multilabel(self):
    """Run ``_test_configurations`` for the multilabel search space on synthetic data.

    A 150-sample multilabel problem is generated with a fixed random state,
    split into 100 training and 50 test samples, and handed to
    ``_test_configurations`` together with the multilabel search space.
    """
    X, Y = sklearn.datasets.make_multilabel_classification(
        n_samples=150,
        n_features=20,
        n_classes=5,
        n_labels=2,
        length=50,
        allow_unlabeled=True,
        sparse=False,
        return_indicator=True,
        return_distributions=False,
        random_state=1)

    # Split at index 100 so that no sample is dropped between the two parts.
    X_train, Y_train = X[:100, :], Y[:100, :]
    X_test, Y_test = X[100:, :], Y[100:, :]

    data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_test': X_test,
        'Y_test': Y_test
    }

    dataset_properties = {'multilabel': True}
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)
    # Pass the generated data on; otherwise it would be built and discarded.
    self._test_configurations(configurations_space=cs, data=data)
def test_configurations_signed_data(self):
    """Fit and predict with ten sampled configurations of the signed-data search space.

    Failures whose message matches one of the tolerated texts below skip
    the sample, as does a ``MemoryError``; any other ``ValueError``,
    ``RuntimeWarning`` or ``UserWarning`` is printed with its configuration
    and re-raised.
    """
    # Use a limit of ~4GiB
    memory_limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))

    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(search_space)

    for _ in range(10):
        config = search_space.sample_configuration()
        config._populate_values()
        # Force n_iter to 5 wherever it is set (presumably to keep the run short).
        for n_iter_key in ('classifier:passive_aggressive:n_iter',
                           'classifier:sgd:n_iter'):
            if config[n_iter_key] is not None:
                config._values[n_iter_key] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            pipeline.fit(X_train, Y_train)
            X_test_copy = X_test.copy()
            predictions = pipeline.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            probabilities = pipeline.predict_proba(X_test_copy)
            self.assertIsInstance(probabilities, np.ndarray)
        except ValueError as e:
            if not any(text in e.args[0] for text in tolerated_value_errors):
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if not any(text in e.args[0] for text in tolerated_runtime_warnings):
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" not in e.args[0]:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            # Running out of memory is tolerated; move on to the next sample.
            pass
def test_predict_proba_batched(self):
    """Compare batched and unbatched ``predict_proba`` with a decision tree.

    Uses the default configuration of a search space restricted to the
    ``decision_tree`` classifier.  The final pipeline step is wrapped in a
    mock so the number of ``predict_proba`` calls can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']})
    default = search_space.get_default_configuration()

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = unittest.mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    reference, batched, spy = run_batched(
        lambda labels: np.array(
            [[1 if i != y else 0 for i in range(10)] for y in labels]))
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(batched.shape, (1647, 10))
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_predict_batched(self):
    """Compare batched and unbatched ``predict`` for the default configuration.

    The fitted pipeline is wrapped in a mock so the number of ``predict``
    calls made with ``batch_size=20`` can be asserted, first for multiclass
    labels and then for multilabel indicator targets.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = search_space.get_default_configuration()
    pipeline = SimpleClassificationPipeline(default)

    def check_batched(encode_targets, expected_shape):
        """Fit, then compare one-shot predictions with ``batch_size=20`` predictions."""
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict(X_test.copy())
        # Wrap the fitted pipeline so calls to its ``predict`` can be counted.
        spy = mock.Mock(wraps=pipeline.pipeline_)
        pipeline.pipeline_ = spy
        batched = pipeline.predict(X_test, batch_size=20)
        self.assertEqual(expected_shape, batched.shape)
        self.assertEqual(83, spy.predict.call_count)
        assert_array_almost_equal(reference, batched)

    # Multiclass
    check_batched(lambda labels: labels, (1647, ))

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    check_batched(
        lambda labels: np.array(
            [[1 if i != y else 0 for i in range(10)] for y in labels]),
        (1647, 10))
def test_add_preprocessor(self):
    """Check that a registered add-on preprocessor shows up in the search space.

    The add-on registry must be empty beforehand.  ``DummyPreprocessor`` is
    removed in a ``finally`` block so that a failing assertion cannot leave
    it registered for the tests that run afterwards.
    """
    self.assertEqual(len(preprocessing_components._addons.components), 0)
    preprocessing_components.add_preprocessor(DummyPreprocessor)
    try:
        self.assertEqual(len(preprocessing_components._addons.components), 1)
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIn('DummyPreprocessor', str(cs))
    finally:
        # Always unregister, whether the assertions passed or failed.
        del preprocessing_components._addons.components['DummyPreprocessor']
def test_add_classifier(self):
    """Check that a registered add-on classifier shows up in the search space.

    The add-on registry must be empty beforehand.  ``DummyClassifier`` is
    removed in a ``finally`` block so that a failing assertion cannot leave
    it registered for the tests that run afterwards.
    """
    self.assertEqual(len(classification_components._addons.components), 0)
    classification_components.add_classifier(DummyClassifier)
    try:
        self.assertEqual(len(classification_components._addons.components), 1)
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIn('DummyClassifier', str(cs))
    finally:
        # Always unregister, whether the assertions passed or failed.
        del classification_components._addons.components['DummyClassifier']
def _get_classification_configuration_space(info, include):
    """Build the classification search space for the dataset described by ``info``.

    Reads ``info['task']`` and ``info['is_sparse']`` to derive the
    ``multilabel``, ``multiclass`` and ``sparse`` dataset properties and
    forwards them, together with ``include``, to
    ``SimpleClassificationPipeline.get_hyperparameter_search_space``.

    Raises:
        NotImplementedError: if the task is ``REGRESSION``.
    """
    task_type = info['task']
    if task_type == REGRESSION:
        raise NotImplementedError()

    # Any task other than multilabel/multiclass leaves both flags False.
    dataset_properties = {
        'multilabel': bool(task_type == MULTILABEL_CLASSIFICATION),
        'multiclass': bool(task_type == MULTICLASS_CLASSIFICATION),
        'sparse': bool(info['is_sparse'] == 1),
    }
    return SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties, include=include)
def test_predict_proba_batched(self):
    """Compare batched and unbatched ``predict_proba`` for the default configuration.

    The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = search_space.get_default_configuration()

    def one_hot(labels):
        """Return a (len(labels), 10) array with a 1 at each label's index."""
        encoded = np.zeros((labels.shape[0], 10))
        for row, label in enumerate(labels):
            encoded[row][label] = 1
        return encoded

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel
    reference, batched, spy = run_batched(one_hot)
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(batched.shape, (1647, 10))
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_add_classifier(self):
    """Check that a registered add-on classifier shows up in the search space.

    The add-on registry must be empty beforehand.  ``DummyClassifier`` is
    removed in a ``finally`` block so that a failing assertion cannot leave
    it registered for the tests that run afterwards.
    """
    self.assertEqual(len(classification_components._addons.components), 0)
    classification_components.add_classifier(DummyClassifier)
    try:
        self.assertEqual(len(classification_components._addons.components), 1)
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        self.assertIn("DummyClassifier", str(cs))
    finally:
        # Always unregister, whether the assertions passed or failed.
        del classification_components._addons.components["DummyClassifier"]
def test_predict_proba_batched_sparse(self):
    """Compare batched and unbatched ``predict_proba`` on sparse digits data.

    Uses an explicit random-forest configuration from the sparse search
    space.  The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})

    config_values = {
        "balancing:strategy": "none",
        "classifier:__choice__": "random_forest",
        "imputation:strategy": "mean",
        "one_hot_encoding:minimum_fraction": 0.01,
        "one_hot_encoding:use_minimum_fraction": 'True',
        "preprocessor:__choice__": "no_preprocessing",
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'gini',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        'classifier:random_forest:max_features': 0.5,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:n_estimators': 100,
        "rescaling:__choice__": "min/max",
    }
    config = Configuration(search_space, values=config_values)

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    reference, batched, spy = run_batched(
        lambda labels: np.array(
            [[1 if i != y else 0 for i in range(10)] for y in labels]))
    self.assertEqual(batched.shape, (1647, 10))
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_get_hyperparameter_search_space_dataset_properties(self):
    """Check how dataset properties change the contents of the search space.

    Components are looked up by name in the string form of each space, and
    the multilabel space must equal the multilabel+multiclass space.
    """
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Multiclass
    multiclass_space = build_space(dataset_properties={"multiclass": True})
    self.assertNotIn("bernoulli_nb", str(multiclass_space))

    # Multilabel
    multilabel_space = build_space(dataset_properties={"multilabel": True})
    for absent in ("k_nearest_neighbors", "liblinear", "libsvm_svc", "sgd"):
        self.assertNotIn(absent, str(multilabel_space))

    # Sparse
    sparse_space = build_space(dataset_properties={"sparse": True})
    for present in ("extra_trees", "gradient_boosting", "random_forest"):
        self.assertIn(present, str(sparse_space))

    # Multilabel combined with multiclass
    combined_space = build_space(
        dataset_properties={"multilabel": True, "multiclass": True}
    )
    self.assertEqual(multilabel_space, combined_space)
def test_default_configuration_multilabel(self):
    """Fit the default multilabel configuration on iris twice and check its accuracy."""
    for _ in range(2):
        search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={"multilabel": True}
        )
        default_config = search_space.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris", make_multilabel=True)

        pipeline = SimpleClassificationPipeline(default_config)
        fitted = pipeline.fit(X_train, Y_train)
        predicted = fitted.predict(X_test)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(0.9599999999999995, accuracy)
        # Only checks that predict_proba runs; the result is not inspected.
        fitted.predict_proba(X_test)
def test_get_hyperparameter_search_space_include_exclude_models(self):
    """Check that ``include`` and ``exclude`` restrict classifiers and preprocessors."""
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Including one classifier leaves it as the only choice.
    included = build_space(include={'classifier': ['libsvm_svc']})
    expected_choice = CategoricalHyperparameter('classifier:__choice__',
                                                ['libsvm_svc'])
    self.assertEqual(included.get_hyperparameter('classifier:__choice__'),
                     expected_choice)

    # Excluding it removes its name from the space.
    excluded = build_space(exclude={'classifier': ['libsvm_svc']})
    self.assertNotIn('libsvm_svc', str(excluded))

    # Same two checks for a preprocessor.
    included = build_space(
        include={'preprocessor': ['select_percentile_classification']})
    expected_choice = CategoricalHyperparameter(
        'preprocessor:__choice__', ['select_percentile_classification'])
    self.assertEqual(included.get_hyperparameter('preprocessor:__choice__'),
                     expected_choice)

    excluded = build_space(
        exclude={'preprocessor': ['select_percentile_classification']})
    self.assertNotIn('select_percentile_classification', str(excluded))
def test_predict_proba_batched_sparse(self):
    """Compare batched and unbatched ``predict_proba`` on sparse digits data.

    Uses an explicit random-forest configuration from the sparse search
    space.  The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})

    config_values = {
        "balancing:strategy": "none",
        "classifier:__choice__": "random_forest",
        "imputation:strategy": "mean",
        "one_hot_encoding:minimum_fraction": 0.01,
        "one_hot_encoding:use_minimum_fraction": 'True',
        "preprocessor:__choice__": "no_preprocessing",
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'gini',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        'classifier:random_forest:max_features': 0.5,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:n_estimators': 100,
        "rescaling:__choice__": "min/max",
    }
    config = Configuration(search_space, values=config_values)

    def one_hot(labels):
        """Return a (len(labels), 10) array with a 1 at each label's index."""
        encoded = np.zeros((labels.shape[0], 10))
        for row, label in enumerate(labels):
            encoded[row][label] = 1
        return encoded

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel
    reference, batched, spy = run_batched(one_hot)
    self.assertEqual(batched.shape, (1647, 10))
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_default_configuration(self):
    """Fit the default configuration on iris twice and check its accuracy."""
    for _ in range(2):
        search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default_config = search_space.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')

        pipeline = SimpleClassificationPipeline(default_config)
        fitted = pipeline.fit(X_train, Y_train)
        predicted = fitted.predict(X_test)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(0.9599999999999995, accuracy)
        # Only checks that predict_proba runs; the result is not inspected.
        fitted.predict_proba(X_test)
def test_configurations_sparse(self):
    """Fit and predict with ten sampled configurations of the sparse search space.

    Failures whose message matches one of the tolerated texts below skip
    the sample; any other ``ValueError``, ``RuntimeWarning`` or
    ``UserWarning`` is printed with its configuration and re-raised.
    """
    # Use a limit of ~4GiB
    memory_limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))

    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(search_space)

    for _ in range(10):
        config = search_space.sample_configuration()
        config._populate_values()
        # Force n_iter to 5 wherever it is present and set
        # (presumably to keep the run short).
        for n_iter_key in ('classifier:passive_aggressive:n_iter',
                           'classifier:sgd:n_iter'):
            if n_iter_key in config and config[n_iter_key] is not None:
                config._values[n_iter_key] = 5

        print(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        try:
            pipeline.fit(X_train, Y_train)
            pipeline.predict(X_test)
        except ValueError as e:
            if not any(text in e.args[0] for text in tolerated_value_errors):
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if not any(text in e.args[0] for text in tolerated_runtime_warnings):
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" not in e.args[0]:
                print(config)
                raise e
def test_get_hyperparameter_search_space_dataset_properties(self):
    """Check how dataset properties change the contents of the search space.

    Components are looked up by name in the string form of each space, and
    the multilabel space must equal the multilabel+multiclass space.
    """
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Multiclass
    multiclass_space = build_space(dataset_properties={'multiclass': True})
    self.assertNotIn('bernoulli_nb', str(multiclass_space))

    # Multilabel
    multilabel_space = build_space(dataset_properties={'multilabel': True})
    for absent in ('k_nearest_neighbors', 'liblinear', 'libsvm_svc', 'sgd'):
        self.assertNotIn(absent, str(multilabel_space))

    # Sparse
    sparse_space = build_space(dataset_properties={'sparse': True})
    for present in ('extra_trees', 'gradient_boosting', 'random_forest'):
        self.assertIn(present, str(sparse_space))

    # Multilabel combined with multiclass
    combined_space = build_space(
        dataset_properties={'multilabel': True, 'multiclass': True})
    self.assertEqual(multilabel_space, combined_space)
def test_predict_proba_batched_sparse(self):
    """Compare batched and unbatched ``predict_proba`` on sparse digits data.

    Uses an explicit random-forest configuration from the sparse search
    space.  The final pipeline step is wrapped in a mock so the number of
    ``predict_proba`` calls made with ``batch_size=20`` can be asserted.
    """
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True}
    )

    config_values = {
        "balancing:strategy": "none",
        "classifier:__choice__": "random_forest",
        "imputation:strategy": "mean",
        "one_hot_encoding:minimum_fraction": 0.01,
        "one_hot_encoding:use_minimum_fraction": "True",
        "preprocessor:__choice__": "no_preprocessing",
        "classifier:random_forest:bootstrap": "True",
        "classifier:random_forest:criterion": "gini",
        "classifier:random_forest:max_depth": "None",
        "classifier:random_forest:min_samples_split": 2,
        "classifier:random_forest:min_samples_leaf": 2,
        "classifier:random_forest:min_weight_fraction_leaf": 0.0,
        "classifier:random_forest:max_features": 0.5,
        "classifier:random_forest:max_leaf_nodes": "None",
        "classifier:random_forest:n_estimators": 100,
        "rescaling:__choice__": "min/max",
    }
    config = Configuration(search_space, values=config_values)

    def run_batched(encode_targets):
        """Fit a fresh pipeline and return (unbatched, batched, spy)."""
        pipeline = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        pipeline.fit(X_train, encode_targets(Y_train))
        reference = pipeline.predict_proba(X_test.copy())
        # The object behind the last step in the pipeline
        spy = mock.Mock(wraps=pipeline.pipeline_.steps[-1][1])
        pipeline.pipeline_.steps[-1] = ("estimator", spy)
        batched = pipeline.predict_proba(X_test, batch_size=20)
        return reference, batched, spy

    # Multiclass
    reference, batched, spy = run_batched(lambda labels: labels)
    self.assertEqual((1647, 10), batched.shape)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel: each row is 0 at the label's index and 1 everywhere else.
    reference, batched, spy = run_batched(
        lambda labels: np.array([[1 if i != y else 0 for i in range(10)] for y in labels])
    )
    self.assertEqual(batched.shape, (1647, 10))
    self.assertIsInstance(batched, np.ndarray)
    self.assertEqual(84, spy.predict_proba.call_count)
    assert_array_almost_equal(reference, batched)
def test_get_hyperparameter_search_space_dataset_properties(self):
    """Check how dataset properties change the contents of the search space.

    Components are looked up by name in the string form of each space, and
    the multilabel space must equal the multilabel+multiclass space.
    """
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Multiclass
    multiclass_space = build_space(dataset_properties={'multiclass': True})
    self.assertNotIn('bernoulli_nb', str(multiclass_space))

    # Multilabel
    multilabel_space = build_space(dataset_properties={'multilabel': True})
    for absent in ('k_nearest_neighbors', 'liblinear', 'libsvm_svc', 'sgd'):
        self.assertNotIn(absent, str(multilabel_space))

    # Sparse
    sparse_space = build_space(dataset_properties={'sparse': True})
    for present in ('extra_trees', 'gradient_boosting', 'random_forest'):
        self.assertIn(present, str(sparse_space))

    # Multilabel combined with multiclass
    combined_space = build_space(
        dataset_properties={'multilabel': True, 'multiclass': True})
    self.assertEqual(multilabel_space, combined_space)
def test_get_hyperparameter_search_space_include_exclude_models(self):
    """Check that ``include`` and ``exclude`` restrict classifiers and preprocessors."""
    build_space = SimpleClassificationPipeline.get_hyperparameter_search_space

    # Including one classifier leaves it as the only choice.
    included = build_space(include={"classifier": ["libsvm_svc"]})
    expected_choice = CategoricalHyperparameter("classifier:__choice__", ["libsvm_svc"])
    self.assertEqual(included.get_hyperparameter("classifier:__choice__"), expected_choice)

    # Excluding it removes its name from the space.
    excluded = build_space(exclude={"classifier": ["libsvm_svc"]})
    self.assertNotIn("libsvm_svc", str(excluded))

    # Same two checks for a preprocessor.
    included = build_space(include={"preprocessor": ["select_percentile_classification"]})
    expected_choice = CategoricalHyperparameter(
        "preprocessor:__choice__", ["select_percentile_classification"]
    )
    self.assertEqual(included.get_hyperparameter("preprocessor:__choice__"), expected_choice)

    excluded = build_space(exclude={"preprocessor": ["select_percentile_classification"]})
    self.assertNotIn("select_percentile_classification", str(excluded))
def test_predict_batched_sparse(self):
    """Batched predict on sparse digits must equal the unbatched result."""
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    settings = {
        "balancing:strategy": "none",
        "classifier:__choice__": "random_forest",
        "imputation:strategy": "mean",
        "one_hot_encoding:minimum_fraction": 0.01,
        "one_hot_encoding:use_minimum_fraction": "True",
        "preprocessor:__choice__": "no_preprocessing",
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'gini',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:max_features': 0.5,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:n_estimators': 100,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        "rescaling:__choice__": "min/max",
    }
    pipeline = SimpleClassificationPipeline(
        Configuration(search_space, values=settings))

    # Multiclass
    X_train, Y_train, X_test, _ = get_dataset(dataset='digits',
                                              make_sparse=True)
    pipeline.fit(X_train, Y_train)
    reference = pipeline.predict(X_test.copy())
    # Wrap the fitted pipeline so the number of predict calls can be counted.
    spy = mock.Mock(wraps=pipeline.pipeline_)
    pipeline.pipeline_ = spy
    batched = pipeline.predict(X_test, batch_size=20)
    self.assertEqual((1647, ), batched.shape)
    self.assertEqual(83, spy.predict.call_count)
    assert_array_almost_equal(reference, batched)

    # Multilabel
    X_train, Y_train, X_test, _ = get_dataset(dataset='digits',
                                              make_sparse=True)
    Y_train = np.array([(label, 26 - label) for label in Y_train])
    pipeline.fit(X_train, Y_train)
    reference = pipeline.predict(X_test.copy())
    spy = mock.Mock(wraps=pipeline.pipeline_)
    pipeline.pipeline_ = spy
    batched = pipeline.predict(X_test, batch_size=20)
    self.assertEqual((1647, 2), batched.shape)
    self.assertEqual(83, spy.predict.call_count)
    assert_array_almost_equal(reference, batched)
def test_default_configuration_multilabel(self):
    """Fit the default multilabel configuration on iris twice and check accuracy."""
    for _ in range(2):
        pipeline = SimpleClassificationPipeline(
            random_state=1, dataset_properties={'multilabel': True})
        search_space = pipeline.get_hyperparameter_search_space()
        default_config = search_space.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(
            dataset='iris', make_multilabel=True)
        pipeline.set_hyperparameters(default_config)
        pipeline = pipeline.fit(X_train, Y_train)
        predicted = pipeline.predict(X_test)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(0.96, accuracy)
        # Only checks that probability prediction runs; the result is unused.
        pipeline.predict_proba(X_test)
def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
    """init_params must reach ``ohe_mock`` without the 'one_hot_encoding:' prefix.

    Checked once for the constructor and once for ``set_hyperparameters``.
    """
    pipeline = SimpleClassificationPipeline(
        init_params={'one_hot_encoding:categorical_features': [True, False]})
    forwarded = ohe_mock.call_args[1]['init_params']
    self.assertEqual(forwarded, {'categorical_features': [True, False]})

    search_space = pipeline.get_hyperparameter_search_space()
    default_config = search_space.get_default_configuration()
    pipeline.set_hyperparameters(
        configuration=default_config,
        init_params={
            'one_hot_encoding:categorical_features': [True, True, False],
        })
    forwarded = ohe_mock.call_args[1]['init_params']
    self.assertEqual(forwarded, {'categorical_features': [True, True, False]})
def test_predict_batched_sparse(self):
    """Batched predict on sparse digits must equal the unbatched result."""

    def check_batched(model, inputs, expected_shape):
        # Predict once unbatched on a copy, then batched through a call-counting
        # wrapper around the fitted pipeline, and compare both results.
        unbatched = model.predict(inputs.copy())
        spy = mock.Mock(wraps=model.pipeline_)
        model.pipeline_ = spy
        batched = model.predict(inputs, batch_size=20)
        self.assertEqual(expected_shape, batched.shape)
        self.assertEqual(83, spy.predict.call_count)
        assert_array_almost_equal(unbatched, batched)

    space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    values = {
        "balancing:strategy": "none",
        "classifier:__choice__": "random_forest",
        "imputation:strategy": "mean",
        "one_hot_encoding:minimum_fraction": 0.01,
        "one_hot_encoding:use_minimum_fraction": "True",
        "preprocessor:__choice__": "no_preprocessing",
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'gini',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:max_features': 0.5,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:n_estimators': 100,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        "rescaling:__choice__": "min/max",
    }
    model = SimpleClassificationPipeline(Configuration(space, values=values))

    # Multiclass
    X_train, Y_train, X_test, _ = get_dataset(dataset='digits',
                                              make_sparse=True)
    model.fit(X_train, Y_train)
    check_batched(model, X_test, (1647,))

    # Multilabel
    X_train, Y_train, X_test, _ = get_dataset(dataset='digits',
                                              make_sparse=True)
    Y_train = np.array([(target, 26 - target) for target in Y_train])
    model.fit(X_train, Y_train)
    check_batched(model, X_test, (1647, 2))
def test_default_configuration_multilabel(self):
    """Fit the default multilabel configuration on iris twice and check accuracy."""
    for _ in range(2):
        properties = {'multilabel': True}
        pipeline = SimpleClassificationPipeline(dataset_properties=properties)
        default_config = (
            pipeline.get_hyperparameter_search_space().get_default_configuration()
        )
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris',
                                                       make_multilabel=True)
        pipeline.set_hyperparameters(default_config)
        pipeline = pipeline.fit(X_train, Y_train)
        predicted = pipeline.predict(X_test)
        accuracy = sklearn.metrics.accuracy_score(predicted, Y_test)
        self.assertAlmostEqual(0.94, accuracy)
        # Only checks that probability prediction runs; the result is unused.
        pipeline.predict_proba(X_test)
def test_get_hyperparameter_search_space(self):
    """Pin the size of the default search space."""
    space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    self.assertIsInstance(space, ConfigurationSpace)
    conditions = space.get_conditions()

    # Number of available choices for each top-level component.
    expected_choice_counts = (
        ("rescaling:__choice__", 4),
        ("classifier:__choice__", 17),
        ("preprocessor:__choice__", 14),
    )
    for key, count in expected_choice_counts:
        self.assertEqual(len(space.get_hyperparameter(key).choices), count)

    hyperparameters = space.get_hyperparameters()
    self.assertEqual(157, len(hyperparameters))
    # Every hyperparameter except six is expected to carry a condition.
    self.assertEqual(len(hyperparameters) - 6, len(conditions))
def test_fit_instantiates_component(self):
    """Make sure that if a preprocessor is added, its fit method is called."""
    preprocessing_components.add_preprocessor(CrashPreprocessor)
    try:
        cls = SimpleClassificationPipeline()
        cs = cls.get_hyperparameter_search_space()
        self.assertIn('CrashPreprocessor', str(cs))
        config = cs.sample_configuration()
        config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
        cls.set_hyperparameters(config)
        with self.assertRaisesRegex(
            ValueError,
            "Make sure fit is called"
        ):
            cls.fit(
                X=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
                y=np.array([1, 0, 1, 1])
            )
    finally:
        # The add-on registry is module-level state: unregister the crash
        # preprocessor even when an assertion above fails, so it cannot leak
        # into the search space seen by other tests.
        del preprocessing_components._addons.components['CrashPreprocessor']
def test_get_hyperparameter_search_space(self):
    """Pin the size of the default search space."""
    space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    self.assertIsInstance(space, ConfigurationSpace)
    conditions = space.get_conditions()

    def choice_count(key):
        # Number of options offered by the categorical hyperparameter ``key``.
        return len(space.get_hyperparameter(key).choices)

    self.assertEqual(choice_count('rescaling:__choice__'), 4)
    self.assertEqual(choice_count('classifier:__choice__'), 17)
    self.assertEqual(choice_count('preprocessor:__choice__'), 14)

    hyperparameters = space.get_hyperparameters()
    total = len(hyperparameters)
    self.assertEqual(157, total)
    # Every hyperparameter except six is expected to carry a condition.
    self.assertEqual(total - 6, len(conditions))
def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
    """init_params must reach ``ohe_mock`` without the 'categorical_encoding:' prefix.

    Checked once for the constructor and once for ``set_hyperparameters``.
    """
    pipeline = SimpleClassificationPipeline(init_params={
        'categorical_encoding:one_hot_encoding:categorical_features':
            [True, False],
    })
    forwarded = ohe_mock.call_args[1]['init_params']
    self.assertEqual(
        forwarded, {'one_hot_encoding:categorical_features': [True, False]})

    search_space = pipeline.get_hyperparameter_search_space()
    default_config = search_space.get_default_configuration()
    pipeline.set_hyperparameters(
        configuration=default_config,
        init_params={
            'categorical_encoding:one_hot_encoding:categorical_features':
                [True, True, False],
        },
    )
    forwarded = ohe_mock.call_args[1]['init_params']
    self.assertEqual(
        forwarded,
        {'one_hot_encoding:categorical_features': [True, True, False]})
def test_categorical_passed_to_one_hot_encoder(self, ohe_mock):
    """``feat_type`` init_params must reach ``ohe_mock`` without their prefix.

    Checked once for the constructor and once for ``set_hyperparameters``.
    """
    # Mock the _check_init_params_honored as there is no object created,
    # _check_init_params_honored will fail as a datapreprocessor was never created
    patch_target = (
        'autosklearn.pipeline.classification.SimpleClassificationPipeline'
        '._check_init_params_honored'
    )
    with unittest.mock.patch(patch_target):
        pipeline = SimpleClassificationPipeline(init_params={
            'data_preprocessing:feat_type': {0: 'categorical', 1: 'numerical'},
        })
        forwarded = ohe_mock.call_args[1]['init_params']
        self.assertEqual(
            forwarded, {'feat_type': {0: 'categorical', 1: 'numerical'}})

        search_space = pipeline.get_hyperparameter_search_space()
        default_config = search_space.get_default_configuration()
        pipeline.set_hyperparameters(
            configuration=default_config,
            init_params={
                'data_preprocessing:feat_type': {
                    0: 'categorical',
                    1: 'categorical',
                    2: 'numerical',
                },
            },
        )
        forwarded = ohe_mock.call_args[1]['init_params']
        self.assertEqual(forwarded, {
            'feat_type': {0: 'categorical', 1: 'categorical', 2: 'numerical'},
        })
def max_estimators_fit_duration(X, y, max_classifier_time_budget, sample_factor=1):
    """Estimate a per-run time limit by timing every default classifier on a sample.

    The sample is first run through a default preprocessing pipeline, then one
    process per registered classifier is started on the transformed data.
    Processes still running when the budget expires are terminated.

    Parameters
    ----------
    X, y
        Sample features and targets used for the timing run.
    max_classifier_time_budget
        Wall-clock budget in seconds shared by all classifier processes; also
        the upper bound of the returned limit.
    sample_factor
        Multiplier applied to the recorded time before it is truncated to int.

    Returns
    -------
    ``int(sample_factor * recorded_time)``, capped at
    ``max_classifier_time_budget``. The recorded time is a shared integer
    initialised to 3 and handed to ``time_single_estimator`` (presumably
    raised there to the slowest observed fit -- confirm against that helper).
    """
    import time  # local import: only needed for the shared join deadline

    p("Constructing preprocessor pipeline and transforming sample data")
    # we don't care about the data here but need to preprocess, otherwise the classifiers crash
    pipeline = SimpleClassificationPipeline(
        include={'imputation': ['most_frequent'], 'rescaling': ['standardize']})
    default_cs = pipeline.get_hyperparameter_search_space().get_default_configuration()
    pipeline = pipeline.set_hyperparameters(default_cs)
    pipeline.fit(X, y)
    X_tr, _ = pipeline.pre_transform(X, y)

    p("Running estimators on the sample")
    # going over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default 3 sec
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(
                target=time_single_estimator,
                name=clf_name,
                args=(clf_name, clf_class, X_tr, y, max_clf_time))
            pr.start()
            processes.append(pr)

        # All processes run concurrently, so they share ONE deadline. Joining
        # each with the full budget in turn could block for up to
        # len(processes) * budget before the last straggler is terminated.
        deadline = time.monotonic() + max_classifier_time_budget
        for pr in processes:
            pr.join(max(0, deadline - time.monotonic()))
            if pr.is_alive():
                p("Terminating "+pr.name+" process due to timeout")
                pr.terminate()
        # Read the shared value while the manager is still alive.
        result_max_clf_time = max_clf_time.value

    p("Test classifier fit completed")

    per_run_time_limit = int(sample_factor*result_max_clf_time)
    return min(per_run_time_limit, max_classifier_time_budget)
def test_configurations_categorical_data(self):
    """Fit and predict ten sampled sparse-space configurations on categorical data.

    Errors whose message contains one of the tolerated fragments skip the
    configuration; anything else prints the configuration and is re-raised.
    """
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True})
    print(space)

    # Hyperparameters that are overwritten with a fixed value when they are
    # present (and not None) in the sampled configuration.
    forced_values = (
        ("classifier:passive_aggressive:n_iter", 5),
        ("classifier:sgd:n_iter", 5),
        ("classifier:adaboost:n_estimators", 50),
        ("classifier:adaboost:max_depth", 1),
    )
    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
        "Numerical problems in QDA",
        "Bug in scikit-learn",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    for _ in range(10):
        config = space.sample_configuration()
        config._populate_values()
        for key, value in forced_values:
            if key in config and config[key] is not None:
                config._values[key] = value
        print(config)

        categorical = [
            True, True, True, False, False, True, True, True,
            False, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, False,
            False, False, True, True, True,
        ]
        data_path = os.path.join(os.path.dirname(__file__), "components",
                                 "data_preprocessing", "dataset.pkl")
        table = np.loadtxt(data_path)
        # The last column holds the target, the remaining ones the features.
        y = table[:, -1].copy()
        X = table[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.cross_validation.train_test_split(X, y)

        pipeline = SimpleClassificationPipeline(config, random_state=1)
        try:
            pipeline.fit(
                X_train, Y_train,
                init_params={"one_hot_encoding:categorical_features": categorical})
            pipeline.predict(X_test)
        except ValueError as e:
            if any(text in e.args[0] for text in tolerated_value_errors):
                continue
            print(config)
            print(traceback.format_exc())
            raise e
        except RuntimeWarning as e:
            if any(text in e.args[0] for text in tolerated_runtime_warnings):
                continue
            print(config)
            raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            print(config)
            raise e
def test_weighting_effect(self):
    """Pin the F1 score of default pipelines with and without class weighting.

    Every case is checked twice: once through ``fit`` and once through
    ``pre_transform`` followed by ``fit_estimator``.
    """
    data = sklearn.datasets.make_classification(
        n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    def split():
        # First 700 samples for training, the remainder for testing.
        features, labels = copy.copy(data)
        return features[:700], labels[:700], features[700:], labels[700:]

    def default_config(include, strategy):
        # Default configuration of the restricted space with the balancing
        # strategy overwritten.
        space = SimpleClassificationPipeline.get_hyperparameter_search_space(
            include=include)
        config = space.get_default_configuration()
        config._values['balancing:strategy'] = strategy
        return config

    def score_via_fit(config):
        X_train, Y_train, X_test, Y_test = split()
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        fitted = pipeline.fit(X_train, Y_train)
        predicted = fitted.predict(X_test)
        return sklearn.metrics.f1_score(predicted, Y_test)

    def score_via_two_step(config):
        X_train, Y_train, X_test, Y_test = split()
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        Xt, fit_params = pipeline.pre_transform(X_train, Y_train)
        pipeline.fit_estimator(Xt, Y_train, **fit_params)
        predicted = pipeline.predict(X_test)
        return sklearn.metrics.f1_score(predicted, Y_test)

    classifier_cases = [
        ('adaboost', AdaboostClassifier, 0.709, 0.658),
        ('decision_tree', DecisionTree, 0.683, 0.701),
        ('extra_trees', ExtraTreesClassifier, 0.812, 0.8),
        ('gradient_boosting', GradientBoostingClassifier, 0.800, 0.760),
        ('random_forest', RandomForest, 0.846, 0.792),
        ('libsvm_svc', LibSVM_SVC, 0.571, 0.658),
        ('liblinear_svc', LibLinear_SVC, 0.685, 0.699),
        ('sgd', SGD, 0.65384615384615385, 0.38795986622073581),
    ]
    for name, _clf, acc_no_weighting, acc_weighting in classifier_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            score = score_via_fit(
                default_config({'classifier': [name]}, strategy))
            self.assertAlmostEqual(acc, score, places=3)

            # pre_transform and fit_estimator
            score = score_via_two_step(
                default_config({'classifier': [name]}, strategy))
            self.assertAlmostEqual(acc, score, places=3)

    preprocessor_cases = [
        ('extra_trees_preproc_for_classification',
         ExtraTreesPreprocessorClassification,
         0.7142857142857143, 0.72180451127819545),
        ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
         0.5934065934065933, 0.71111111111111114),
    ]
    for name, _pre, acc_no_weighting, acc_weighting in preprocessor_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            score = score_via_fit(default_config(
                {'classifier': ['sgd'], 'preprocessor': [name]}, strategy))
            self.assertAlmostEqual(acc, score, places=3)

            # pre_transform and fit_estimator
            score = score_via_two_step(default_config(
                {'classifier': ['sgd'], 'preprocessor': [name]}, strategy))
            self.assertAlmostEqual(acc, score, places=3)
def test_weighting_effect(self):
    """Pin the F1 score of default pipelines with and without class weighting.

    Every case is checked twice: once through ``fit`` and once through
    ``fit_transformer`` followed by ``fit_estimator``.
    """
    data = sklearn.datasets.make_classification(
        n_samples=200, n_features=10, n_redundant=2, n_informative=2,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    def split():
        # First 100 samples for training, the remainder for testing.
        features, labels = copy.copy(data)
        return features[:100], labels[:100], features[100:], labels[100:]

    def f1_of(model, X_test, Y_test):
        return sklearn.metrics.f1_score(model.predict(X_test), Y_test)

    classifier_cases = [
        ('adaboost', AdaboostClassifier, 0.810, 0.735, 3),
        ('decision_tree', DecisionTree, 0.780, 0.643, 3),
        ('extra_trees', ExtraTreesClassifier, 0.780, 0.8, 3),
        ('gradient_boosting', GradientBoostingClassifier, 0.737, 0.684, 3),
        ('random_forest', RandomForest, 0.780, 0.789, 3),
        ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3),
        ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3),
        ('passive_aggressive', PassiveAggressive, 0.642, 0.449, 3),
        ('sgd', SGD, 0.818, 0.575, 2),
    ]
    for name, _clf, acc_no_weighting, acc_weighting, places in classifier_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            X_train, Y_train, X_test, Y_test = split()
            include = {'classifier': [name],
                       'preprocessor': ['no_preprocessing']}
            pipeline = SimpleClassificationPipeline(
                random_state=1, include=include)
            default = (
                pipeline.get_hyperparameter_search_space()
                .get_default_configuration()
            )
            default._values['balancing:strategy'] = strategy
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            fitted = pipeline.fit(X_train, Y_train)
            self.assertAlmostEqual(
                f1_of(fitted, X_test, Y_test), acc,
                places=places, msg=(name, strategy))

            # fit_transformer and fit_estimator
            X_train, Y_train, X_test, Y_test = split()
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            pipeline.set_hyperparameters(configuration=default)
            Xt, fit_params = pipeline.fit_transformer(X_train, Y_train)
            pipeline.fit_estimator(Xt, Y_train, **fit_params)
            self.assertAlmostEqual(
                f1_of(pipeline, X_test, Y_test), acc, places=places)

    preprocessor_cases = [
        ('extra_trees_preproc_for_classification',
         ExtraTreesPreprocessorClassification, 0.810, 0.563),
        ('liblinear_svc_preprocessor', LibLinear_Preprocessor, 0.837, 0.567),
    ]
    for name, _pre, acc_no_weighting, acc_weighting in preprocessor_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            X_train, Y_train, X_test, Y_test = split()
            include = {'classifier': ['sgd'], 'preprocessor': [name]}
            pipeline = SimpleClassificationPipeline(
                random_state=1, include=include)
            default = (
                pipeline.get_hyperparameter_search_space()
                .get_default_configuration()
            )
            default._values['balancing:strategy'] = strategy
            pipeline.set_hyperparameters(default)
            fitted = pipeline.fit(X_train, Y_train)
            self.assertAlmostEqual(
                f1_of(fitted, X_test, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            X_train, Y_train, X_test, Y_test = split()
            default._values['balancing:strategy'] = strategy
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            Xt, fit_params = pipeline.fit_transformer(X_train, Y_train)
            pipeline.fit_estimator(Xt, Y_train, **fit_params)
            self.assertAlmostEqual(
                f1_of(pipeline, X_test, Y_test), acc, places=3)
def test_configurations(self):
    """Run the shared configuration check on the default search space."""
    self._test_configurations(
        configurations_space=(
            SimpleClassificationPipeline.get_hyperparameter_search_space()
        ),
    )
def test_repr(self):
    """``repr`` of a default pipeline must evaluate back to a pipeline."""
    search_space = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default_config = search_space.get_default_configuration()
    text = repr(SimpleClassificationPipeline(default_config))
    # eval is applied only to the repr string produced on the line above.
    rebuilt = eval(text)
    self.assertIsInstance(rebuilt, SimpleClassificationPipeline)
def test_configurations_signed_data(self):
    """Run the shared configuration check on the space for signed data."""
    properties = {'signed': True}
    signed_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=properties)
    self._test_configurations(configurations_space=signed_space)
def test_configurations_sparse(self):
    """Run the shared configuration check on the sparse space with sparse data."""
    properties = {'sparse': True}
    sparse_space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=properties)
    self._test_configurations(configurations_space=sparse_space,
                              make_sparse=True)
def test_configurations_categorical_data(self):
    """Fit and predict ten sampled sparse-space configurations on categorical data.

    Errors whose message contains one of the tolerated fragments skip the
    configuration; anything else prints the configuration and is re-raised.
    """
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(space)

    # Iteration counts overwritten with 5 when present and not None.
    capped_iteration_keys = (
        'classifier:passive_aggressive:n_iter',
        'classifier:sgd:n_iter',
    )
    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    for _ in range(10):
        config = space.sample_configuration()
        config._populate_values()
        for key in capped_iteration_keys:
            if key in config and config[key] is not None:
                config._values[key] = 5
        print(config)

        categorical = [
            True, True, True, False, False, True, True, True,
            False, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, False,
            False, False, True, True, True,
        ]
        data_path = os.path.join(os.path.dirname(__file__), "components",
                                 "data_preprocessing", "dataset.pkl")
        table = np.loadtxt(data_path)
        # The last column holds the target, the remaining ones the features.
        y = table[:, -1].copy()
        X = table[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.cross_validation.train_test_split(X, y)

        pipeline = SimpleClassificationPipeline(config, random_state=1,)
        try:
            pipeline.fit(
                X_train, Y_train,
                init_params={'one_hot_encoding:categorical_features': categorical})
            pipeline.predict(X_test)
        except ValueError as e:
            if any(text in e.args[0] for text in tolerated_value_errors):
                continue
            print(config)
            traceback.print_tb(sys.exc_info()[2])
            raise e
        except RuntimeWarning as e:
            if any(text in e.args[0] for text in tolerated_runtime_warnings):
                continue
            print(config)
            raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            print(config)
            raise e
def test_set_hyperparameters_honors_configuration(self):
    """Make sure that a given configuration is honored in practice.

    For every combination of the four boolean dataset properties a
    configuration is sampled and applied with ``set_hyperparameters``. The
    pipeline steps are then walked and compared against the configuration,
    and at the end every configuration key must have been checked.
    """
    # Top-level steps that are choices, mapped to their configuration key.
    choice_keys = {
        'feature_preprocessor': 'feature_preprocessor:__choice__',
        'classifier': 'classifier:__choice__',
    }

    flag_combinations = itertools.product([True, False], repeat=4)
    for sparse, multilabel, signed, multiclass in flag_combinations:
        pipeline = SimpleClassificationPipeline(
            random_state=1,
            dataset_properties={
                'sparse': sparse,
                'multilabel': multilabel,
                'multiclass': multiclass,
                'signed': signed,
            },
        )
        config = pipeline.get_hyperparameter_search_space().sample_configuration()

        # Set hyperparameters takes a given config and translates
        # it into an actual implementation
        pipeline.set_hyperparameters(config)
        config_dict = config.get_dictionary()

        # Collects every configuration key that one of the helpers verified.
        keys_checked = []

        for name, step in pipeline.named_steps.items():
            if name == 'data_preprocessing':
                # Both the numerical and the categorical sub-pipeline
                # have to be checked
                sub_pipelines = {
                    'numerical_transformer': step.numer_ppl.named_steps,
                    'categorical_transformer': step.categ_ppl.named_steps,
                }
                for data_type, sub_steps in sub_pipelines.items():
                    for sub_name, sub_step in sub_steps.items():
                        prefix = "data_preprocessing:{}:{}".format(
                            data_type, sub_name)
                        if isinstance(sub_step, AutoSklearnChoice):
                            # A choice: make sure it is the correct one
                            keys_checked += self._test_set_hyperparameter_choice(
                                prefix + ":__choice__", sub_step, config_dict)
                        elif isinstance(sub_step, AutoSklearnComponent):
                            # A component: make sure its hyperparameters match
                            keys_checked += self._test_set_hyperparameter_component(
                                prefix, sub_step, config_dict)
                        else:
                            raise ValueError("New type of pipeline component!")
            elif name == 'balancing':
                keys_checked += self._test_set_hyperparameter_component(
                    'balancing', step, config_dict)
            elif name in choice_keys:
                keys_checked += self._test_set_hyperparameter_choice(
                    choice_keys[name], step, config_dict)
            else:
                raise ValueError(
                    "Found another type of step! Need to update this check")

        # Make sure we checked the whole configuration
        self.assertSetEqual(set(config_dict.keys()), set(keys_checked))
def test_configurations_sparse(self):
    """Fit and predict ten sampled sparse-space configurations on sparse digits.

    Errors whose message contains one of the tolerated fragments skip the
    configuration; anything else prints the configuration and is re-raised.
    """
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    space = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True})
    print(space)

    # Hyperparameters that are overwritten with a fixed value when they are
    # present (and not None) in the sampled configuration.
    forced_values = (
        ("classifier:passive_aggressive:n_iter", 5),
        ("classifier:sgd:n_iter", 5),
        ("classifier:adaboost:n_estimators", 50),
        ("classifier:adaboost:max_depth", 1),
    )
    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
        "Numerical problems in QDA",
        "Bug in scikit-learn",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )

    for _ in range(10):
        config = space.sample_configuration()
        config._populate_values()
        for key, value in forced_values:
            if key in config and config[key] is not None:
                config._values[key] = value
        print(config)

        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits",
                                                       make_sparse=True)
        pipeline = SimpleClassificationPipeline(config, random_state=1)
        try:
            pipeline.fit(X_train, Y_train)
            pipeline.predict(X_test)
        except ValueError as e:
            if any(text in e.args[0] for text in tolerated_value_errors):
                continue
            print(config)
            print(traceback.format_exc())
            raise e
        except RuntimeWarning as e:
            if any(text in e.args[0] for text in tolerated_runtime_warnings):
                continue
            print(config)
            raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            print(config)
            raise e
def test_weighting_effect(self):
    """Pin the F1 score of default pipelines with and without class weighting.

    Every case is checked twice: once through ``fit`` and once through
    ``fit_transformer`` followed by ``fit_estimator``.
    """
    data = sklearn.datasets.make_classification(
        n_samples=200, n_features=10, n_redundant=2, n_informative=2,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    def split():
        # First 100 samples for training, the remainder for testing.
        features, labels = copy.copy(data)
        return features[:100], labels[:100], features[100:], labels[100:]

    def f1_of(model, X_test, Y_test):
        return sklearn.metrics.f1_score(model.predict(X_test), Y_test)

    classifier_cases = [
        ('adaboost', AdaboostClassifier, 0.810, 0.735),
        ('decision_tree', DecisionTree, 0.780, 0.643),
        ('extra_trees', ExtraTreesClassifier, 0.75, 0.800),
        ('gradient_boosting', GradientBoostingClassifier, 0.789, 0.762),
        ('random_forest', RandomForest, 0.75, 0.821),
        ('libsvm_svc', LibSVM_SVC, 0.769, 0.72),
        ('liblinear_svc', LibLinear_SVC, 0.762, 0.735),
        ('sgd', SGD, 0.704, 0.667),
    ]
    for name, _clf, acc_no_weighting, acc_weighting in classifier_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            X_train, Y_train, X_test, Y_test = split()
            include = {'classifier': [name],
                       'preprocessor': ['no_preprocessing']}
            pipeline = SimpleClassificationPipeline(
                random_state=1, include=include)
            default = (
                pipeline.get_hyperparameter_search_space()
                .get_default_configuration()
            )
            default._values['balancing:strategy'] = strategy
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            fitted = pipeline.fit(X_train, Y_train)
            self.assertAlmostEqual(
                f1_of(fitted, X_test, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            X_train, Y_train, X_test, Y_test = split()
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            pipeline.set_hyperparameters(configuration=default)
            Xt, fit_params = pipeline.fit_transformer(X_train, Y_train)
            pipeline.fit_estimator(Xt, Y_train, **fit_params)
            self.assertAlmostEqual(
                f1_of(pipeline, X_test, Y_test), acc, places=3)

    preprocessor_cases = [
        ('extra_trees_preproc_for_classification',
         ExtraTreesPreprocessorClassification, 0.691, 0.692),
        ('liblinear_svc_preprocessor', LibLinear_Preprocessor, 0.692, 0.590),
    ]
    for name, _pre, acc_no_weighting, acc_weighting in preprocessor_cases:
        for strategy, acc in (('none', acc_no_weighting),
                              ('weighting', acc_weighting)):
            # Fit
            X_train, Y_train, X_test, Y_test = split()
            include = {'classifier': ['sgd'], 'preprocessor': [name]}
            pipeline = SimpleClassificationPipeline(
                random_state=1, include=include)
            default = (
                pipeline.get_hyperparameter_search_space()
                .get_default_configuration()
            )
            default._values['balancing:strategy'] = strategy
            pipeline.set_hyperparameters(default)
            fitted = pipeline.fit(X_train, Y_train)
            self.assertAlmostEqual(
                f1_of(fitted, X_test, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            X_train, Y_train, X_test, Y_test = split()
            default._values['balancing:strategy'] = strategy
            pipeline = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            Xt, fit_params = pipeline.fit_transformer(X_train, Y_train)
            pipeline.fit_estimator(Xt, Y_train, **fit_params)
            self.assertAlmostEqual(
                f1_of(pipeline, X_test, Y_test), acc, places=3)
def test_multilabel(self):
    """Fit and predict ten sampled configurations on multilabel data.

    Ten configurations are sampled from the multilabel search space
    (seeded with 5). Each one is fitted on the first 100 samples of a
    synthetic multilabel dataset and used to predict labels and
    probabilities for the remaining samples; the test asserts that the
    results are numpy arrays. A fixed list of known numerical failures,
    as well as ``MemoryError`` and ``LinAlgError``, makes the test skip
    the configuration instead of failing. The ``RuntimeWarning`` and
    ``UserWarning`` handlers only trigger when warnings are raised as
    exceptions.
    """
    # Use a limit of ~4GiB. Only the soft limit is lowered and the
    # previous limits are restored after the test, so the cap does not
    # leak into whatever else runs in this process.
    limit = 4000 * 1024 * 1024
    previous_limits = resource.getrlimit(resource.RLIMIT_AS)
    resource.setrlimit(resource.RLIMIT_AS, (limit, previous_limits[1]))
    self.addCleanup(resource.setrlimit, resource.RLIMIT_AS, previous_limits)

    # Hyperparameters that are overridden, when active, with the values
    # given here.
    overrides = {
        "classifier:passive_aggressive:n_iter": 5,
        "classifier:sgd:n_iter": 5,
        "classifier:adaboost:n_estimators": 50,
        "classifier:adaboost:max_depth": 1,
    }

    # Message fragments of failures that are tolerated per exception type.
    tolerated_value_errors = (
        "Floating-point under-/overflow occurred at epoch",
        "removed all features",
        "all features are discarded",
        "Numerical problems in QDA",
        "Bug in scikit-learn",
    )
    tolerated_runtime_warnings = (
        "invalid value encountered in sqrt",
        "divide by zero encountered in",
        "invalid value encountered in divide",
        "invalid value encountered in true_divide",
    )
    tolerated_user_warnings = (
        "FastICA did not converge",
    )

    def is_tolerated(exc, fragments):
        # An exception raised without arguments has no message to match;
        # treat it as not tolerated so the original error is re-raised
        # instead of being masked by an IndexError.
        message = str(exc.args[0]) if exc.args else ""
        return any(fragment in message for fragment in fragments)

    dataset_properties = {"multilabel": True}
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)
    print(cs)
    cs.seed(5)

    for _ in range(10):
        # The data is regenerated for every configuration, so each fit
        # starts from freshly created arrays.
        X, Y = sklearn.datasets.make_multilabel_classification(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1,
        )
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        # Test on everything that was not used for training (the split
        # previously started at 101 and silently dropped sample 100).
        X_test = X[100:, :]

        config = cs.sample_configuration()
        for key, value in overrides.items():
            if key in config and config[key] is not None:
                config._values[key] = value

        cls = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            # predict_proba gets its own copy of the test data.
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            for probabilities in predicted_probabilities:
                self.assertIsInstance(probabilities, np.ndarray)
        except (np.linalg.LinAlgError, MemoryError):
            continue
        except ValueError as e:
            if is_tolerated(e, tolerated_value_errors):
                continue
            print(config)
            print(traceback.format_exc())
            raise
        except RuntimeWarning as e:
            if is_tolerated(e, tolerated_runtime_warnings):
                continue
            print(config)
            print(traceback.format_exc())
            raise
        except UserWarning as e:
            if is_tolerated(e, tolerated_user_warnings):
                continue
            print(config)
            print(traceback.format_exc())
            raise