def test_predict_batched_sparse(self): dataset_properties = {'sparse': True} include = {'regressor': ['decision_tree']} cs = SimpleRegressionPipeline( dataset_properties=dataset_properties, include=include).get_hyperparameter_search_space() default = cs.get_default_configuration() regressor = SimpleRegressionPipeline( config=default, random_state=1, dataset_properties=dataset_properties, include=include) X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', make_sparse=True) regressor.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = regressor.predict(X_test_) mock_predict = unittest.mock.Mock( wraps=regressor.steps[-1][-1].predict) regressor.steps[-1][-1].predict = mock_predict prediction = regressor.predict(X_test, batch_size=20) self.assertEqual((356, ), prediction.shape) self.assertEqual(18, mock_predict.call_count) np.testing.assert_array_almost_equal(prediction_, prediction)
def test_configurations(self): # Use a limit of ~4GiB limit = 4000 * 1024 * 1024 resource.setrlimit(resource.RLIMIT_AS, (limit, limit)) cs = SimpleRegressionPipeline.get_hyperparameter_search_space() print(cs) cs.seed(1) for i in range(10): config = cs.sample_configuration() config._populate_values() if config['regressor:sgd:n_iter'] is not None: config._values['regressor:sgd:n_iter'] = 5 X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') cls = SimpleRegressionPipeline(config, random_state=1) print(config) try: cls.fit(X_train, Y_train) X_test_ = X_test.copy() predictions = cls.predict(X_test) self.assertIsInstance(predictions, np.ndarray) predicted_probabiliets = cls.predict(X_test_) self.assertIsInstance(predicted_probabiliets, np.ndarray) except ValueError as e: if "Floating-point under-/overflow occurred at epoch" in \ e.args[0] or \ "removed all features" in e.args[0] or \ "all features are discarded" in e.args[0]: continue else: print(config) print(traceback.format_exc()) raise e except RuntimeWarning as e: if "invalid value encountered in sqrt" in e.args[0]: continue elif "divide by zero encountered in" in e.args[0]: continue elif "invalid value encountered in divide" in e.args[0]: continue elif "invalid value encountered in true_divide" in e.args[0]: continue else: print(config) print(traceback.format_exc()) raise e except UserWarning as e: if "FastICA did not converge" in e.args[0]: continue else: print(config) print(traceback.format_exc()) raise e except MemoryError as e: continue
def test_configurations_sparse(self): # Use a limit of ~4GiB limit = 4000 * 1024 * 1024 resource.setrlimit(resource.RLIMIT_AS, (limit, limit)) cs = SimpleRegressionPipeline.get_hyperparameter_search_space( dataset_properties={'sparse': True}) print(cs) for i in range(10): config = cs.sample_configuration() config._populate_values() if 'classifier:passive_aggressive:n_iter' in config and \ config[ 'classifier:passive_aggressive:n_iter'] is not None: config._values['classifier:passive_aggressive:n_iter'] = 5 if 'classifier:sgd:n_iter' in config and \ config['classifier:sgd:n_iter'] is not None: config._values['classifier:sgd:n_iter'] = 5 print(config) X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', make_sparse=True) cls = SimpleRegressionPipeline(config, random_state=1) try: cls.fit(X_train, Y_train) predictions = cls.predict(X_test) except ValueError as e: if "Floating-point under-/overflow occurred at epoch" in \ e.args[0] or \ "removed all features" in e.args[0] or \ "all features are discarded" in e.args[0]: continue else: print(config) traceback.print_tb(sys.exc_info()[2]) raise e except RuntimeWarning as e: if "invalid value encountered in sqrt" in e.args[0]: continue elif "divide by zero encountered in" in e.args[0]: continue elif "invalid value encountered in divide" in e.args[0]: continue elif "invalid value encountered in true_divide" in e.args[0]: continue else: print(config) raise e except UserWarning as e: if "FastICA did not converge" in e.args[0]: continue else: print(config) raise e
def test_predict_batched(self): cs = SimpleRegressionPipeline.get_hyperparameter_search_space() default = cs.get_default_configuration() cls = SimpleRegressionPipeline(default) X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) cls_predict = mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((356,), prediction.shape) self.assertEqual(18, cls_predict.predict.call_count) assert_array_almost_equal(prediction_, prediction)
def test_predict_batched(self): include = {'regressor': ['decision_tree']} cs = SimpleRegressionPipeline(include=include).get_hyperparameter_search_space() default = cs.get_default_configuration() regressor = SimpleRegressionPipeline(default, include=include) X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') regressor.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = regressor.predict(X_test_) mock_predict = unittest.mock.Mock(wraps=regressor.steps[-1][-1].predict) regressor.steps[-1][-1].predict = mock_predict prediction = regressor.predict(X_test, batch_size=20) self.assertEqual((356,), prediction.shape) self.assertEqual(18, mock_predict.call_count) assert_array_almost_equal(prediction_, prediction)
def test_predict_batched(self): cs = SimpleRegressionPipeline.get_hyperparameter_search_space( include={'regressor': ['decision_tree']}) default = cs.get_default_configuration() cls = SimpleRegressionPipeline(default) X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((292, ), prediction.shape) self.assertEqual(15, cls_predict.predict.call_count) assert_array_almost_equal(prediction_, prediction)
def test_default_configuration(self): for i in range(2): X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') auto = SimpleRegressionPipeline() auto = auto.fit(X_train, Y_train) predictions = auto.predict(copy.deepcopy(X_test)) # The lower the worse r2_score = sklearn.metrics.r2_score(Y_test, predictions) self.assertAlmostEqual(0.417, r2_score, places=3) model_score = auto.score(copy.deepcopy(X_test), Y_test) self.assertAlmostEqual(model_score, r2_score, places=5)
def test_default_configuration(self): for i in range(2): X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') auto = SimpleRegressionPipeline() auto = auto.fit(X_train, Y_train) predictions = auto.predict(copy.deepcopy(X_test)) # The lower the worse r2_score = sklearn.metrics.r2_score(Y_test, predictions) self.assertAlmostEqual(0.339, r2_score, places=3) model_score = auto.score(copy.deepcopy(X_test), Y_test) self.assertAlmostEqual(model_score, r2_score, places=5)
def test_default_configuration(self): for i in range(2): cs = SimpleRegressionPipeline.get_hyperparameter_search_space() default = cs.get_default_configuration() X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes') auto = SimpleRegressionPipeline(default) auto = auto.fit(X_train, Y_train) predictions = auto.predict(copy.deepcopy(X_test)) # The lower the worse r2_score = sklearn.metrics.r2_score(Y_test, predictions) self.assertAlmostEqual(0.41732302035060087, r2_score) model_score = auto.score(copy.deepcopy(X_test), Y_test) self.assertEqual(model_score, r2_score)
def test_pipeline_clonability(self): X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston') auto = SimpleRegressionPipeline(random_state=1) auto = auto.fit(X_train, Y_train) auto_clone = clone(auto) auto_clone_params = auto_clone.get_params() # Make sure all keys are copied properly for k, v in auto.get_params().items(): self.assertIn(k, auto_clone_params) # Make sure the params getter of estimator are honored klass = auto.__class__ new_object_params = auto.get_params(deep=False) for name, param in new_object_params.items(): new_object_params[name] = clone(param, safe=False) new_object = klass(**new_object_params) params_set = new_object.get_params(deep=False) for name in new_object_params: param1 = new_object_params[name] param2 = params_set[name] self.assertEqual(param1, param2)
    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, dataset_properties=None):
        """Fit and predict 10 random configurations, skipping benign failures.

        :param configurations_space: ConfigSpace to sample configurations from.
        :param make_sparse: whether to sparsify the default boston dataset.
        :param data: optional dict with 'X_train'/'Y_train'/'X_test'/'Y_test'
            arrays; when given it replaces the boston dataset.
        :param dataset_properties: forwarded to SimpleRegressionPipeline.
        """
        # Use a limit of ~3GiB so runaway configurations raise MemoryError
        # (caught below) instead of exhausting the host.
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        configurations_space.seed(1)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'regressor:passive_aggressive:n_iter': 5,
                            'regressor:sgd:n_iter': 5,
                            'regressor:adaboost:n_estimators': 50,
                            'regressor:adaboost:max_depth': 1,
                            'preprocessor:kernel_pca:n_components': 10,
                            'preprocessor:kitchen_sinks:n_components': 50,
                            'regressor:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'preprocessor:truncatedSVD:target_dim': 10,
                            'preprocessor:polynomial:degree': 2,
                            'regressor:lda:n_components': 10}

            # Only overwrite hyperparameters that are present and active
            # in this particular sampled configuration.
            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='boston', make_sparse=make_sparse, add_NaNs=True)
            else:
                # Copy so one configuration cannot corrupt the shared data.
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            cls = SimpleRegressionPipeline(random_state=1,
                                           dataset_properties=dataset_properties)
            cls.set_hyperparameters(config)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except MemoryError as e:
                # Expected under the rlimit above; try the next config.
                continue
            except ValueError as e:
                # Known numerical/data degeneracies are tolerated; everything
                # else is a real failure.
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except Exception as e:
                # Broad catch-all: only this one known message is tolerated.
                if "Multiple input features cannot have the same target value" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, dataset_properties=None):
        """Fit and predict 10 random configurations, skipping benign failures.

        Also verifies sklearn's fitted-state contract: components must not
        report as fitted after set_hyperparameters, and must report as
        fitted after fit.

        :param configurations_space: ConfigSpace to sample configurations from.
        :param make_sparse: whether to sparsify the default boston dataset.
        :param data: optional dict with 'X_train'/'Y_train'/'X_test'/'Y_test'
            arrays; when given it replaces the boston dataset.
        :param dataset_properties: forwarded to SimpleRegressionPipeline.
        """
        # Use a limit of ~3GiB so runaway configurations raise MemoryError
        # (caught below) instead of exhausting the host.
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        configurations_space.seed(1)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {
                'regressor:adaboost:n_estimators': 50,
                'regressor:adaboost:max_depth': 1,
                'feature_preprocessor:kernel_pca:n_components': 10,
                'feature_preprocessor:kitchen_sinks:n_components': 50,
                'regressor:libsvm_svc:degree': 2,
                'regressor:libsvm_svr:degree': 2,
                'regressor:libsvm_svr:C': 1.,
                'feature_preprocessor:truncatedSVD:target_dim': 10,
                'feature_preprocessor:polynomial:degree': 2,
                'regressor:lda:n_components': 10
            }
            # Only overwrite hyperparameters that are present and active
            # in this particular sampled configuration.
            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and config[
                        restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='boston', make_sparse=make_sparse, add_NaNs=True)
            else:
                # Copy so one configuration cannot corrupt the shared data.
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                # NOTE(review): result discarded — Y_test is never assigned
                # or used below; presumably a leftover from an older version
                # of this helper. Confirm and drop or assign.
                data['Y_test'].copy()

            cls = SimpleRegressionPipeline(
                random_state=1, dataset_properties=dataset_properties)
            cls.set_hyperparameters(config)

            # First make sure that for this configuration, setting the parameters
            # does not mistakenly set the estimator as fitted
            for name, step in cls.named_steps.items():
                with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
                                            "instance is not fitted yet"):
                    check_is_fitted(step)

            try:
                cls.fit(X_train, Y_train)
                # After fit, all components should be tagged as fitted
                # by sklearn. Check is fitted raises an exception if that
                # is not the case
                try:
                    for name, step in cls.named_steps.items():
                        check_is_fitted(step)
                except sklearn.exceptions.NotFittedError:
                    self.fail(
                        "config={} raised NotFittedError unexpectedly!".format(
                            config))
                cls.predict(X_test)
            except MemoryError:
                # Expected under the rlimit above; try the next config.
                continue
            except np.linalg.LinAlgError:
                continue
            except ValueError as e:
                # Known numerical/data degeneracies are tolerated; everything
                # else is a real failure.
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                        'values.' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                elif "invalid value encountered in multiply" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except Exception as e:
                # Broad catch-all: only this one known message is tolerated.
                if "Multiple input features cannot have the same target value" in e.args[
                        0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
    def _test_configurations(self, configurations_space, make_sparse=False,
                             data=None, dataset_properties=None):
        """Fit and predict 10 random configurations, skipping benign failures.

        :param configurations_space: ConfigSpace to sample configurations from.
        :param make_sparse: whether to sparsify the default boston dataset.
        :param data: optional dict with 'X_train'/'Y_train'/'X_test'/'Y_test'
            arrays; when given it replaces the boston dataset.
        :param dataset_properties: forwarded to SimpleRegressionPipeline.
        """
        # Use a limit of ~3GiB so runaway configurations raise MemoryError
        # (caught below) instead of exhausting the host.
        limit = 3072 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        configurations_space.seed(1)

        for i in range(10):
            config = configurations_space.sample_configuration()
            config._populate_values()

            # Restrict configurations which could take too long on travis-ci
            restrictions = {'regressor:adaboost:n_estimators': 50,
                            'regressor:adaboost:max_depth': 1,
                            'preprocessor:kernel_pca:n_components': 10,
                            'preprocessor:kitchen_sinks:n_components': 50,
                            'regressor:libsvm_svc:degree': 2,
                            'regressor:libsvm_svr:degree': 2,
                            'preprocessor:truncatedSVD:target_dim': 10,
                            'preprocessor:polynomial:degree': 2,
                            'regressor:lda:n_components': 10}

            # Only overwrite hyperparameters that are present and active
            # in this particular sampled configuration.
            for restrict_parameter in restrictions:
                restrict_to = restrictions[restrict_parameter]
                if restrict_parameter in config and \
                        config[restrict_parameter] is not None:
                    config._values[restrict_parameter] = restrict_to

            if data is None:
                X_train, Y_train, X_test, Y_test = get_dataset(
                    dataset='boston', make_sparse=make_sparse, add_NaNs=True)
            else:
                # Copy so one configuration cannot corrupt the shared data.
                X_train = data['X_train'].copy()
                Y_train = data['Y_train'].copy()
                X_test = data['X_test'].copy()
                Y_test = data['Y_test'].copy()

            cls = SimpleRegressionPipeline(random_state=1,
                                           dataset_properties=dataset_properties)
            cls.set_hyperparameters(config)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except MemoryError as e:
                # Expected under the rlimit above; try the next config.
                continue
            except ValueError as e:
                # Known numerical/data degeneracies are tolerated; everything
                # else is a real failure.
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0]:
                    continue
                elif "removed all features" in e.args[0]:
                    continue
                elif "all features are discarded" in e.args[0]:
                    continue
                elif "Numerical problems in QDA" in e.args[0]:
                    continue
                elif 'Bug in scikit-learn' in e.args[0]:
                    continue
                elif 'The condensed distance matrix must contain only finite ' \
                        'values.' in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except Exception as e:
                # Broad catch-all: only this one known message is tolerated.
                if "Multiple input features cannot have the same target value" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e