def test_get_hyperparameter_search_space_preprocessor_contradicts_default_classifier(
        self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        include={'preprocessor': ['densifier']},
        dataset_properties={'sparse': True})
    self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                     'qda')

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        include={'preprocessor': ['nystroem_sampler']})
    self.assertEqual(cs.get_hyperparameter('classifier:__choice__').default,
                     'sgd')
def test_default_configuration(self):
    for i in range(2):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = ParamSklearnClassifier(default)
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.9599999999999995,
            sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)
def test_configurations_sparse(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        print(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls = ParamSklearnClassifier(config, random_state=1)
        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
def _get_classification_configuration_space(info, include):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties,
        include=include)
def _get_classification_configuration_space(info, include_estimators=None,
                                            include_preprocessors=None):
    task_type = info['task']

    multilabel = False
    multiclass = False
    sparse = False

    if task_type == MULTILABEL_CLASSIFICATION:
        multilabel = True
    if task_type == REGRESSION:
        raise NotImplementedError()
    if task_type == MULTICLASS_CLASSIFICATION:
        multiclass = True
    if task_type == BINARY_CLASSIFICATION:
        pass

    if info['is_sparse'] == 1:
        sparse = True

    dataset_properties = {
        'multilabel': multilabel,
        'multiclass': multiclass,
        'sparse': sparse
    }

    return ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
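# Hypothetical usage sketch (not part of the original sources): `info` is
# assumed to be the data manager's metadata dictionary, which is the only
# structure these helpers actually read ('task' and 'is_sparse'):
#
#     info = {'task': MULTICLASS_CLASSIFICATION, 'is_sparse': 0}
#     cs = _get_classification_configuration_space(info)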
def test_configurations_signed_data(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls = ParamSklearnClassifier(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            self.assertIsInstance(predicted_probabilities, np.ndarray)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError as e:
            continue
def test_get_hyperparameter_search_space_include_exclude_models(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        include={'classifier': ['libsvm_svc']})
    self.assertEqual(cs.get_hyperparameter('classifier:__choice__'),
                     CategoricalHyperparameter('classifier:__choice__',
                                               ['libsvm_svc']))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        exclude={'classifier': ['libsvm_svc']})
    self.assertNotIn('libsvm_svc', str(cs))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        include={'preprocessor': ['select_percentile_classification']})
    self.assertEqual(cs.get_hyperparameter('preprocessor:__choice__'),
                     CategoricalHyperparameter(
                         'preprocessor:__choice__',
                         ['select_percentile_classification']))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        exclude={'preprocessor': ['select_percentile_classification']})
    self.assertNotIn('select_percentile_classification', str(cs))
def test_get_hyperparameter_search_space_dataset_properties(self):
    cs_mc = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'multiclass': True})
    self.assertNotIn('bernoulli_nb', str(cs_mc))

    cs_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'multilabel': True})
    self.assertNotIn('k_nearest_neighbors', str(cs_ml))
    self.assertNotIn('liblinear', str(cs_ml))
    self.assertNotIn('libsvm_svc', str(cs_ml))
    self.assertNotIn('sgd', str(cs_ml))

    cs_sp = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    self.assertIn('extra_trees', str(cs_sp))
    self.assertIn('gradient_boosting', str(cs_sp))
    self.assertIn('random_forest', str(cs_sp))

    cs_mc_ml = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'multilabel': True, 'multiclass': True})
    self.assertEqual(cs_ml, cs_mc_ml)
def test_get_hyperparameter_search_space(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space()
    self.assertIsInstance(cs, ConfigurationSpace)
    conditions = cs.get_conditions()

    self.assertEqual(len(cs.get_hyperparameter(
        'rescaling:__choice__').choices), 4)
    self.assertEqual(len(cs.get_hyperparameter(
        'classifier:__choice__').choices), 16)
    self.assertEqual(len(cs.get_hyperparameter(
        'preprocessor:__choice__').choices), 14)

    hyperparameters = cs.get_hyperparameters()
    self.assertEqual(145, len(hyperparameters))

    #for hp in sorted([str(h) for h in hyperparameters]):
    #    print hp

    # The six hyperparameters which are always active are the classifier,
    # preprocessor, rescaling and balancing choices plus the imputation and
    # one-hot encoding strategies; all others are conditional.
    self.assertEqual(len(hyperparameters) - 6, len(conditions))
def test_configurations_categorical_data(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        print(config)
        categorical = [True, True, True, False, False, True, True, True,
                       False, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       False, False, False, True, True, True]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(os.path.join(this_directory, "components",
                                    "data_preprocessing", "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.cross_validation.train_test_split(X, y)

        cls = ParamSklearnClassifier(config, random_state=1)
        try:
            cls.fit(X_train, Y_train,
                    init_params={'one_hot_encoding:categorical_features':
                                 categorical})
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
def test_weighting_effect(self):
    for name, clf, acc_no_weighting, acc_weighting in \
            [('adaboost', AdaboostClassifier, 0.692, 0.719),
             ('decision_tree', DecisionTree, 0.712, 0.668),
             ('extra_trees', ExtraTreesClassifier, 0.901, 0.919),
             ('gradient_boosting', GradientBoostingClassifier, 0.879, 0.883),
             ('random_forest', RandomForest, 0.886, 0.885),
             ('libsvm_svc', LibSVM_SVC, 0.915, 0.937),
             ('liblinear_svc', LibLinear_SVC, 0.920, 0.923),
             ('sgd', SGD, 0.811, 0.902)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            # Fit
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                include={'classifier': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = ParamSklearnClassifier(default, random_state=1)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                acc,
                sklearn.metrics.accuracy_score(predictions, Y_test),
                places=3)

            # pre_transform and fit_estimator
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                include={'classifier': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = ParamSklearnClassifier(default, random_state=1)
            Xt, fit_params = classifier.pre_transform(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                acc,
                sklearn.metrics.accuracy_score(predictions, Y_test),
                places=3)

    for name, pre, acc_no_weighting, acc_weighting in \
            [('extra_trees_preproc_for_classification',
              ExtraTreesPreprocessor, 0.892, 0.910),
             ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
              0.906, 0.909)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                include={'classifier': ['sgd'], 'preprocessor': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = ParamSklearnClassifier(default, random_state=1)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                acc,
                sklearn.metrics.accuracy_score(predictions, Y_test),
                places=3)

            # pre_transform and fit_estimator
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cs = ParamSklearnClassifier.get_hyperparameter_search_space(
                include={'classifier': ['sgd'], 'preprocessor': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = ParamSklearnClassifier(default, random_state=1)
            Xt, fit_params = classifier.pre_transform(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, fit_params=fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                acc,
                sklearn.metrics.accuracy_score(predictions, Y_test),
                places=3)
D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
X = D.data['X_train']
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(1.0, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'sgd',
        'classifier:sgd:loss': 'hinge',
        'classifier:sgd:penalty': 'l2',
        'classifier:sgd:alpha': 0.0001,
        'classifier:sgd:fit_intercept': True,
        'classifier:sgd:n_iter': 5,
        'classifier:sgd:learning_rate': 'optimal',
        'classifier:sgd:eta0': 0.01,
        'classifier:sgd:average': True,
        'imputation:strategy': 'mean',
        'one_hot_encoding:use_minimum_fraction': 'True',
        'one_hot_encoding:minimum_fraction': 0.1,
        'preprocessor:__choice__': 'no_preprocessing',
        'rescaling:__choice__': 'min/max'}))]

classifiers = []
targets = []
predictions = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
def test_predict_batched(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    cls = ParamSklearnClassifier(default)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 2), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_multilabel(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    dataset_properties = {'multilabel': True}
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)
    print(cs)
    cs.seed(5)

    for i in range(50):
        X, Y = sklearn.datasets.\
            make_multilabel_classification(n_samples=150,
                                           n_features=20,
                                           n_classes=5,
                                           n_labels=2,
                                           length=50,
                                           allow_unlabeled=True,
                                           sparse=False,
                                           return_indicator=True,
                                           return_distributions=False,
                                           random_state=1)
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, ]

        config = cs.sample_configuration()
        config._populate_values()
        if 'classifier:passive_aggressive:n_iter' in config:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config:
            config._values['classifier:sgd:n_iter'] = 5

        cls = ParamSklearnClassifier(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            [self.assertIsInstance(i, np.ndarray)
             for i in predicted_probabilities]
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError as e:
            continue
def test_predict_proba_batched(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = ParamSklearnClassifier(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = ParamSklearnClassifier(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_proba_batched_sparse(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={"balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"})

    # Multiclass
    cls = ParamSklearnClassifier(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = ParamSklearnClassifier(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_repr(self):
    cs = ParamSklearnClassifier.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    representation = repr(ParamSklearnClassifier(default))
    cls = eval(representation)
    self.assertIsInstance(cls, ParamSklearnClassifier)
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.480000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'random_forest',
        'classifier:random_forest:bootstrap': 'True',
        'classifier:random_forest:criterion': 'entropy',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:max_features': 4.885151102990943,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:min_samples_leaf': 2,
        'classifier:random_forest:min_samples_split': 2,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        'classifier:random_forest:n_estimators': 100,
        'imputation:strategy': 'median',
        'one_hot_encoding:minimum_fraction': 0.059297498551361,
        'one_hot_encoding:use_minimum_fraction': 'True',
        'preprocessor:__choice__': 'gem',
        'preprocessor:gem:N': 13,
        'preprocessor:gem:precond': 0.31299029323203487,
        'rescaling:__choice__': 'min/max'})),
     (0.300000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'random_forest',
        'classifier:random_forest:bootstrap': 'False',
        'classifier:random_forest:criterion': 'entropy',
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.580000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'extra_trees',
        'classifier:extra_trees:bootstrap': 'True',
        'classifier:extra_trees:criterion': 'gini',
        'classifier:extra_trees:max_depth': 'None',
        'classifier:extra_trees:max_features': 1.4927328322706173,
        'classifier:extra_trees:min_samples_leaf': 1,
        'classifier:extra_trees:min_samples_split': 5,
        'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
        'classifier:extra_trees:n_estimators': 100,
        'imputation:strategy': 'mean',
        'one_hot_encoding:use_minimum_fraction': 'False',
        'preprocessor:__choice__': 'select_rates',
        'preprocessor:select_rates:alpha': 0.4308279694614349,
        'preprocessor:select_rates:mode': 'fwe',
        'preprocessor:select_rates:score_func': 'f_classif',
        'rescaling:__choice__': 'min/max'})),
     (0.200000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'sgd',
        'classifier:sgd:alpha': 5.707045187542232e-06,
        'classifier:sgd:average': 'True',
from ParamSklearn.classification import ParamSklearnClassifier
from HPOlibConfigSpace.random_sampler import RandomSampler

import sklearn.datasets
import sklearn.metrics
import numpy as np

iris = sklearn.datasets.load_iris()
X = iris.data
Y = iris.target
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

configuration_space = ParamSklearnClassifier.get_hyperparameter_search_space()
sampler = RandomSampler(configuration_space, 1)
for i in range(10000):
    configuration = sampler.sample_configuration()
    auto = ParamSklearnClassifier(configuration)
    try:
        auto = auto.fit(X[indices[:100]], Y[indices[:100]])
    except Exception as e:
        print(configuration)
        print(e)
        continue
    predictions = auto.predict(X[indices[100:]])
    print(sklearn.metrics.accuracy_score(predictions, Y[indices[100:]]))
classifiers = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
for weight, configuration in zip(weights, configurations):
    for param in configuration:
        try:
            configuration[param] = int(configuration[param])
        except Exception:
            try:
                configuration[param] = float(configuration[param])
            except Exception:
                pass

    classifier = ParamSklearnClassifier(configuration, 1)
    classifiers.append(classifier)
    try:
        classifier.fit(X.copy(), y.copy())
        predictions_valid.append(
            classifier.predict_proba(X_valid.copy()) * weight)
        predictions_test.append(
            classifier.predict_proba(X_test.copy()) * weight)
    except Exception as e:
        print(e)
        print(configuration)

# Output the predictions
for name, predictions in [('valid', predictions_valid),
                          ('test', predictions_test)]:
    predictions = np.array(predictions)
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.140000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'random_forest',
        'classifier:random_forest:bootstrap': 'False',
        'classifier:random_forest:criterion': 'gini',
        'classifier:random_forest:max_depth': 'None',
        'classifier:random_forest:max_features': 4.649151092701434,
        'classifier:random_forest:max_leaf_nodes': 'None',
        'classifier:random_forest:min_samples_leaf': 3,
        'classifier:random_forest:min_samples_split': 5,
        'classifier:random_forest:min_weight_fraction_leaf': 0.0,
        'classifier:random_forest:n_estimators': 100,
        'imputation:strategy': 'most_frequent',
        'one_hot_encoding:minimum_fraction': 0.006861808529548735,
        'one_hot_encoding:use_minimum_fraction': 'True',
        'preprocessor:__choice__': 'select_rates',
        'preprocessor:select_rates:alpha': 0.03408255008474342,
        'preprocessor:select_rates:mode': 'fwe',
        'preprocessor:select_rates:score_func': 'f_classif',
        'rescaling:__choice__': 'normalize'})),
     (0.100000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'random_forest',
        'classifier:random_forest:bootstrap': 'False',
classifiers = []
predictions_valid = []
predictions_test = []

# Make predictions and weight them
for weight, configuration in zip(weights, configurations):
    for param in configuration:
        try:
            configuration[param] = int(configuration[param])
        except Exception:
            try:
                configuration[param] = float(configuration[param])
            except Exception:
                pass

    classifier = ParamSklearnClassifier(configuration, 1)
    classifiers.append(classifier)
    try:
        classifier.fit(X.copy(), y.copy())
        predictions_valid.append(
            classifier.predict_proba(X_valid.copy()) * weight)
        predictions_test.append(
            classifier.predict_proba(X_test.copy()) * weight)
    except Exception as e:
        print(e)
        print(configuration)

# Output the predictions
for name, predictions in [('valid', predictions_valid),
                          ('test', predictions_test)]:
    predictions = np.array(predictions)
    predictions = np.sum(predictions, axis=0)
    predictions = predictions[:, 1].reshape((-1, 1))
def get_model(configuration, seed):
    if 'classifier' in configuration:
        return ParamSklearnClassifier(configuration, seed)
    elif 'regressor' in configuration:
        return ParamSklearnRegressor(configuration, seed)
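# Illustrative call of get_model (a sketch, not part of the original script;
# it assumes a configuration drawn from one of the classification search
# spaces built in the snippets above, plus training data X_train / Y_train):
#
#     model = get_model(configuration, seed=1)
#     model.fit(X_train, Y_train)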
D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
X = D.data['X_train']
y = D.data['Y_train']
X_valid = D.data['X_valid']
X_test = D.data['X_test']

# Replace the following array by a new ensemble
choices = \
    [(0.220000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'passive_aggressive',
        'classifier:passive_aggressive:C': 0.0022574783522003694,
        'classifier:passive_aggressive:fit_intercept': 'True',
        'classifier:passive_aggressive:loss': 'hinge',
        'classifier:passive_aggressive:n_iter': 119,
        'imputation:strategy': 'most_frequent',
        'one_hot_encoding:minimum_fraction': 0.1898871876010834,
        'one_hot_encoding:use_minimum_fraction': 'True',
        'preprocessor:__choice__': 'gem',
        'preprocessor:gem:N': 20,
        'preprocessor:gem:precond': 0.27540716190663134,
        'rescaling:__choice__': 'min/max'})),
     (0.160000, ParamSklearnClassifier(configuration={
        'balancing:strategy': 'none',
        'classifier:__choice__': 'passive_aggressive',
        'classifier:passive_aggressive:C': 8.011168723835382,
        'classifier:passive_aggressive:fit_intercept': 'True',
        'classifier:passive_aggressive:loss': 'hinge',
        'classifier:passive_aggressive:n_iter': 20,