def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']})
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        include={'classifier': ['decision_tree']},
        dataset_properties={'sparse': True})
    config = cs.get_default_configuration()
    cls = SimpleClassificationPipeline(config)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, list)
    self.assertEqual(2, len(prediction))
    self.assertEqual((1647, 10), prediction[0].shape)
    self.assertEqual((1647, 10), prediction[1].shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_configurations_signed_data(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'signed': True})
    print(cs)

    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            self.assertIsInstance(predicted_probabilities, np.ndarray)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue
def test_predict_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    cls = SimpleClassificationPipeline(default)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
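# Where the asserted call counts in the batched tests above come from (a
# sketch of the arithmetic, not taken from the pipeline implementation): the
# digits test split used here has 1647 samples, so batch_size=20 yields
# ceil(1647 / 20) = 83 batches. The predict_proba tests assert 84 calls,
# which suggests one extra call on top of the per-batch ones -- presumably to
# size the output array -- but that is an inference from the asserted counts,
# not a documented fact.
import math

n_samples, batch_size = 1647, 20
n_batches = math.ceil(n_samples / batch_size)
assert n_batches == 83       # matches the predict call_count assertions
assert n_batches + 1 == 84   # matches the predict_proba call_count assertions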
def get_models_performance_by_data(dataset):
    X, y = dataset
    model = random_model()

    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []

    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    time_start = time.time()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
    time_end = time.time()
    duration = time_end - time_start

    models_performance = {
        "train_accuracy_score": np.mean(train_accuracy_score),
        "test_accuracy_score": np.mean(test_accuracy_score),
        "train_log_loss": np.mean(train_log_loss),
        "test_log_loss": np.mean(test_log_loss),
        "duration": duration / 5  # average duration per fold
    }
    return models_performance
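# Note on the function above: log_loss is fed hard class predictions from
# predict(), so the "log loss" it reports is computed on 0/1 labels rather
# than probability estimates. If calibrated probabilities are wanted instead,
# a variant (sketch, same pipeline API) would use predict_proba:
#
#     y_test_proba = p.predict_proba(X_test)
#     test_log_loss.append(log_loss(y_test, y_test_proba))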
def get_performance_of_encoded_model(data_set, encoded_model, verbose=False):
    """Get a model performance array (4 x 1) from an encoded model vector (17 x 1).

    data_set : (X, y) input dataset to evaluate on
    encoded_model : encoded model choice vector (17 x 1)
    verbose : if True, log the model choice dictionary and the performance array

    return : model performance vector (4 x 1)
    """
    train_accuracy_score = []
    test_accuracy_score = []
    train_log_loss = []
    test_log_loss = []

    X, y = data_set
    # kf = KFold(n_splits=5, random_state=1, shuffle=True)
    model = decode_model(encoded_model)
    if verbose:
        print('Model choice: {0}'.format(model))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=42)
    p = SimpleClassificationPipeline(config=model)
    p.fit(X_train, y_train)
    # scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring,
    #                                                 cv=5, return_train_score=True)
    # print(scores)
    y_train_pred = p.predict(X_train)
    y_test_pred = p.predict(X_test)
    train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
    test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
    train_log_loss.append(log_loss(y_train, y_train_pred))
    test_log_loss.append(log_loss(y_test, y_test_pred))

    model_performance = np.array([np.mean(train_accuracy_score),
                                  np.mean(test_accuracy_score),
                                  np.mean(train_log_loss),
                                  np.mean(test_log_loss)])
    if verbose:
        print('Model Performance: {0}'.format(model_performance))
    return model_performance
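# A minimal usage sketch for get_performance_of_encoded_model on a toy
# dataset from scikit-learn. decode_model() and the 17-element encoding
# layout are defined elsewhere in this project; the zero vector below is only
# a placeholder and may not decode to a valid model choice.
from sklearn.datasets import load_digits

X_toy, y_toy = load_digits(return_X_y=True)
placeholder_encoding = np.zeros(17)  # hypothetical encoding, see decode_model
performance = get_performance_of_encoded_model((X_toy, y_toy),
                                               placeholder_encoding,
                                               verbose=True)
# performance holds [train_accuracy, test_accuracy, train_log_loss, test_log_loss]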
def test_predict_proba_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()

    # Multiclass
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(default)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train_ = np.zeros((Y_train.shape[0], 10))
    for i, y in enumerate(Y_train):
        Y_train_[i][y] = 1
    Y_train = Y_train_
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
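# The loop above builds a one-hot indicator matrix row by row. An equivalent
# vectorized form, for reference (plain numpy, same result for integer class
# labels in the range 0..9):
#
#     Y_train = np.eye(10)[Y_train.astype(int)]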
def test_predict_batched(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    cls = SimpleClassificationPipeline(default)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 2), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_proba_batched(self):
    # Multiclass
    cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_fit_instantiates_component(self):
    """Make sure that if a preprocessor is added, its fit method is called."""
    preprocessing_components.add_preprocessor(CrashPreprocessor)

    # We reduce the search space as forbidden clauses prevent instantiating
    # the user-defined preprocessor manually
    cls = SimpleClassificationPipeline(
        include={'classifier': ['random_forest']})
    cs = cls.get_hyperparameter_search_space()
    self.assertIn('CrashPreprocessor', str(cs))
    config = cs.sample_configuration()
    try:
        config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
    except Exception as e:
        # In case of failure, clean up the added component and print enough
        # information to debug the problem in the future
        del preprocessing_components._addons.components['CrashPreprocessor']
        self.fail("cs={} config={} Exception={}".format(cs, config, e))
    cls.set_hyperparameters(config)
    with self.assertRaisesRegex(ValueError, "Make sure fit is called"):
        cls.fit(
            X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]),
            y=np.array([1, 0, 1, 1])
        )
    del preprocessing_components._addons.components['CrashPreprocessor']
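# CrashPreprocessor itself is defined elsewhere in the test module. A minimal
# sketch of what such a component could look like, assuming auto-sklearn's
# add-on preprocessor API (AutoSklearnPreprocessingAlgorithm, the
# autosklearn.pipeline.constants flags, and an empty ConfigurationSpace); the
# property flags below are illustrative, not copied from the real class.
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import \
    AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class CrashPreprocessor(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        pass

    def fit(self, X, y=None):
        # The tests assert exactly this message via assertRaisesRegex
        raise ValueError("Make sure fit is called")

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'CrashPreprocessor',
                'name': 'CrashPreprocessor',
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        return ConfigurationSpace()  # no hyperparameters in this sketch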
def test_predict_proba_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": 'True',
            "preprocessor:__choice__": "no_preprocessing",
            'classifier:random_forest:bootstrap': 'True',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:min_samples_split': 2,
            'classifier:random_forest:min_samples_leaf': 2,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            'classifier:random_forest:max_features': 0.5,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:n_estimators': 100,
            "rescaling:__choice__": "min/max"
        })

    # Multiclass
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def get_models_performance(reproduce_num, data_set_idx):
    """
    reproduce_num: the number of model choices for the dataset to reproduce
    data_set_idx: generated data set index; tried models for the dataset are
        loaded from a json file

    return: reproduced models performance, also written to a json file
    """
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

    tried_models_filename = "./log/classifier_log" + str(data_set_idx) + \
        "/tried_models_for_dataset" + str(data_set_idx) + ".json"
    models_performance = {}
    # duration = get_training_duration(data_set_idx)
    with open(tried_models_filename) as fp:
        models = json.load(fp)

    reproduce_num_act = min(len(models), reproduce_num)
    for i in range(1, reproduce_num_act + 1):
        model = models[str(i)]
        # print(model)
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []
        # kf = KFold(n_splits=5, random_state=1, shuffle=True)
        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, shuffle=True)
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        # scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
        # print(scores)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start

        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration
        }
        # if i in duration:
        #     models_performance[i]["duration"] = duration[i]

    reproduce_performance_json_filename = "./log/classifier_log" + \
        str(data_set_idx) + "/reproduce_models_performance" + \
        str(data_set_idx) + ".json"
    with open(reproduce_performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
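# Reading the reproduced results back, e.g. for later analysis (sketch;
# assumes the same data_set_idx used above -- note json.dump converts the
# integer keys to strings):
#
#     with open("./log/classifier_log0/reproduce_models_performance0.json") as fp:
#         results = json.load(fp)
#     print(results["1"]["test_accuracy_score"])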
def test_predict_proba_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": 'True',
            "preprocessor:__choice__": "no_preprocessing",
            'classifier:random_forest:bootstrap': 'True',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:min_samples_split': 2,
            'classifier:random_forest:min_samples_leaf': 2,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            'classifier:random_forest:max_features': 0.5,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:n_estimators': 100,
            "rescaling:__choice__": "min/max"
        })

    # Multiclass
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train_ = np.zeros((Y_train.shape[0], 10))
    for i, y in enumerate(Y_train):
        Y_train_[i][y] = 1
    Y_train = Y_train_
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_configurations_sparse(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(cs)

    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if 'classifier:passive_aggressive:n_iter' in config and \
                config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config and \
                config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        print(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
def test_predict_proba_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": "True",
            "preprocessor:__choice__": "no_preprocessing",
            "classifier:random_forest:bootstrap": "True",
            "classifier:random_forest:criterion": "gini",
            "classifier:random_forest:max_depth": "None",
            "classifier:random_forest:min_samples_split": 2,
            "classifier:random_forest:min_samples_leaf": 2,
            "classifier:random_forest:min_weight_fraction_leaf": 0.0,
            "classifier:random_forest:max_features": 0.5,
            "classifier:random_forest:max_leaf_nodes": "None",
            "classifier:random_forest:n_estimators": 100,
            "rescaling:__choice__": "min/max",
        },
    )

    # Multiclass
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits",
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    cls = SimpleClassificationPipeline(config)
    X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits",
                                                   make_sparse=True)
    Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                        for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
    cls.pipeline_.steps[-1] = ("estimator", cls_predict)
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual(prediction.shape, (1647, 10))
    self.assertIsInstance(prediction, np.ndarray)
    self.assertEqual(84, cls_predict.predict_proba.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_predict_batched_sparse(self):
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    config = Configuration(
        cs,
        values={
            "balancing:strategy": "none",
            "classifier:__choice__": "random_forest",
            "imputation:strategy": "mean",
            "one_hot_encoding:minimum_fraction": 0.01,
            "one_hot_encoding:use_minimum_fraction": "True",
            "preprocessor:__choice__": "no_preprocessing",
            'classifier:random_forest:bootstrap': 'True',
            'classifier:random_forest:criterion': 'gini',
            'classifier:random_forest:max_depth': 'None',
            'classifier:random_forest:min_samples_split': 2,
            'classifier:random_forest:min_samples_leaf': 2,
            'classifier:random_forest:max_features': 0.5,
            'classifier:random_forest:max_leaf_nodes': 'None',
            'classifier:random_forest:n_estimators': 100,
            'classifier:random_forest:min_weight_fraction_leaf': 0.0,
            "rescaling:__choice__": "min/max"
        })
    cls = SimpleClassificationPipeline(config)

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647,), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)

    # Multilabel
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    Y_train = np.array([(y, 26 - y) for y in Y_train])
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict(X_test_)
    cls_predict = mock.Mock(wraps=cls.pipeline_)
    cls.pipeline_ = cls_predict
    prediction = cls.predict(X_test, batch_size=20)
    self.assertEqual((1647, 2), prediction.shape)
    self.assertEqual(83, cls_predict.predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def get_performance_of_range_encoded_models(data_set_idx,
                                            encoded_all_model_hyperparameters,
                                            json_model, verbose=False):
    """Get models performance (30 x 5) from an encoded model choice matrix (30 x 38)."""
    X = np.loadtxt('Data_Set/X_' + str(data_set_idx))
    y = np.loadtxt('Data_Set/y_' + str(data_set_idx))
    probas = np.loadtxt('Data_Set/probas_' + str(data_set_idx))
    # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    models_performance = {}
    # get_performance_of_encoded_model([X, y], encoded_all_model_hyperparameters[0])
    for i in range(len(encoded_all_model_hyperparameters)):
        # model = models[str(i)]
        encoded_model = encoded_all_model_hyperparameters[i]
        model = decode_model(encoded_model)
        if verbose:
            print('Original json model: ', json_model[str(i + 1)])
            print('Encoded model: ', encoded_model)
            print('Decoded model: ', model)
            print("==========================================================")
        train_accuracy_score = []
        test_accuracy_score = []
        train_log_loss = []
        test_log_loss = []
        # kf = KFold(n_splits=5, random_state=1)
        time_start = time.time()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, shuffle=True)
        p = SimpleClassificationPipeline(config=model)
        p.fit(X_train, y_train)
        # scores = sklearn.model_selection.cross_validate(p, X, y, scoring=scoring, cv=5, return_train_score=True)
        # print(scores)
        y_train_pred = p.predict(X_train)
        y_test_pred = p.predict(X_test)
        train_accuracy_score.append(accuracy_score(y_train, y_train_pred))
        test_accuracy_score.append(accuracy_score(y_test, y_test_pred))
        train_log_loss.append(log_loss(y_train, y_train_pred))
        test_log_loss.append(log_loss(y_test, y_test_pred))
        time_end = time.time()
        duration = time_end - time_start

        models_performance[i] = {
            "train_accuracy_score": np.mean(train_accuracy_score),
            "test_accuracy_score": np.mean(test_accuracy_score),
            "train_log_loss": np.mean(train_log_loss),
            "test_log_loss": np.mean(test_log_loss),
            "duration": duration
        }

    performance_json_filename = "./log/classifier_log" + str(data_set_idx) + \
        "/reproduce_models_performance" + str(data_set_idx) + ".json"
    with open(performance_json_filename, 'w') as fp:
        json.dump(models_performance, fp)
    return models_performance
def test_predict_batched(self):
    cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    np.testing.assert_array_almost_equal(prediction_, prediction)
def test_predict_batched_sparse(self):
    cls = SimpleClassificationPipeline(dataset_properties={'sparse': True},
                                       include={'classifier': ['sgd']})

    # Multiclass
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                   make_sparse=True)
    cls.fit(X_train, Y_train)
    X_test_ = X_test.copy()
    prediction_ = cls.predict_proba(X_test_)
    # The object behind the last step in the pipeline
    cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
    cls.steps[-1][-1].predict_proba = cls_predict
    prediction = cls.predict_proba(X_test, batch_size=20)
    self.assertEqual((1647, 10), prediction.shape)
    self.assertEqual(84, cls_predict.call_count)
    assert_array_almost_equal(prediction_, prediction)
def test_default_configuration(self):
    for i in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline()
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
        auto.predict_proba(X_test)
def test_default_configuration(self):
    for i in range(2):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline()
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.94, sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)
def test_fit_instantiates_component(self):
    """Make sure that if a preprocessor is added, its fit method is called."""
    preprocessing_components.add_preprocessor(CrashPreprocessor)
    cls = SimpleClassificationPipeline()
    cs = cls.get_hyperparameter_search_space()
    self.assertIn('CrashPreprocessor', str(cs))
    config = cs.sample_configuration()
    config['feature_preprocessor:__choice__'] = 'CrashPreprocessor'
    cls.set_hyperparameters(config)
    with self.assertRaisesRegex(
        ValueError,
        "Make sure fit is called"
    ):
        cls.fit(
            X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]),
            y=np.array([1, 0, 1, 1])
        )
    del preprocessing_components._addons.components['CrashPreprocessor']
def test_default_configuration_multilabel(self):
    for i in range(2):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={"multilabel": True})
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="iris",
                                                       make_multilabel=True)
        auto = SimpleClassificationPipeline(default)
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.9599999999999995,
            sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)
def test_default_configuration(self):
    for i in range(2):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        auto = SimpleClassificationPipeline(default)
        auto = auto.fit(X_train, Y_train)
        predictions = auto.predict(X_test)
        self.assertAlmostEqual(
            0.9599999999999995,
            sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = auto.predict_proba(X_test)
def test_default_configuration_multilabel(self):
    for i in range(2):
        classifier = SimpleClassificationPipeline(
            random_state=1, dataset_properties={'multilabel': True})
        cs = classifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(
            dataset='iris', make_multilabel=True)
        classifier.set_hyperparameters(default)
        classifier = classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        self.assertAlmostEqual(
            0.96, sklearn.metrics.accuracy_score(predictions, Y_test))
        classifier.predict_proba(X_test)
def max_estimators_fit_duration(X, y, max_classifier_time_budget,
                                sample_factor=1):
    p("constructing preprocessor pipeline and transforming sample dataset")
    # We don't care about the data here, but we need to preprocess it,
    # otherwise the classifiers crash
    default_cs = SimpleClassificationPipeline(
    ).get_hyperparameter_search_space(
        # include={'imputation': 'most_frequent',
        #          'rescaling': 'standardize'}
    ).get_default_configuration()
    preprocessor = SimpleClassificationPipeline(default_cs, random_state=42)
    preprocessor.fit(X, y)
    X_tr, dummy = preprocessor.pre_transform(X, y)

    p("running estimators on a subset")
    # Go over all default classifiers used by auto-sklearn
    clfs = autosklearn.pipeline.components.classification._classifiers

    processes = []
    with multiprocessing.Manager() as manager:
        max_clf_time = manager.Value('i', 3)  # default 3 sec
        for clf_name, clf_class in clfs.items():
            pr = multiprocessing.Process(
                target=time_single_estimator, name=clf_name,
                args=(clf_name, clf_class, X_tr, y, max_clf_time))
            pr.start()
            processes.append(pr)
        for pr in processes:
            # Block for max_classifier_time_budget or until the classifier
            # fit process finishes. After max_classifier_time_budget we
            # terminate all still-running processes here.
            pr.join(max_classifier_time_budget)
            if pr.is_alive():
                p("terminating " + pr.name + " process due to timeout")
                pr.terminate()
        result_max_clf_time = max_clf_time.value

    p("test classifier fit completed")
    per_run_time_limit = int(sample_factor * result_max_clf_time)
    return max_classifier_time_budget \
        if per_run_time_limit > max_classifier_time_budget \
        else per_run_time_limit
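# time_single_estimator is referenced above but not shown here. A hypothetical
# sketch consistent with how it is called: fit one auto-sklearn component with
# its default hyperparameters and, if the fit takes longer than anything seen
# so far, store the new maximum in the shared Value. Constructing the
# component from its default configuration this way is an assumption, not the
# project's actual code.
import time


def time_single_estimator(clf_name, clf_class, X, y, max_clf_time):
    start = time.time()
    try:
        cs = clf_class.get_hyperparameter_search_space()
        # assumption: the component accepts its default hyperparameters as kwargs
        clf = clf_class(**cs.get_default_configuration().get_dictionary())
        clf.fit(X, y)
    except Exception as e:
        p("fitting " + clf_name + " failed: " + str(e))
    duration = int(time.time() - start) + 1
    if duration > max_clf_time.value:
        max_clf_time.value = duration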
def test_default_configuration_multilabel(self):
    for i in range(2):
        dataset_properties = {'multilabel': True}
        classifier = SimpleClassificationPipeline(
            dataset_properties=dataset_properties)
        cs = classifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        X_train, Y_train, X_test, Y_test = get_dataset(
            dataset='iris', make_multilabel=True)
        classifier.set_hyperparameters(default)
        classifier = classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        self.assertAlmostEqual(
            0.94, sklearn.metrics.accuracy_score(predictions, Y_test))
        scores = classifier.predict_proba(X_test)
def test_pipeline_clonability(self):
    X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
    auto = SimpleClassificationPipeline()
    auto = auto.fit(X_train, Y_train)
    auto_clone = clone(auto)
    auto_clone_params = auto_clone.get_params()

    # Make sure all keys are copied properly
    for k, v in auto.get_params().items():
        self.assertIn(k, auto_clone_params)

    # Make sure the params getter of the estimator is honored
    klass = auto.__class__
    new_object_params = auto.get_params(deep=False)
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)
    new_object = klass(**new_object_params)
    params_set = new_object.get_params(deep=False)

    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        self.assertEqual(param1, param2)
def test_configurations_categorical_data(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={'sparse': True})
    print(cs)

    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if 'classifier:passive_aggressive:n_iter' in config and \
                config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config and \
                config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5

        print(config)
        categorical = [True, True, True, False, False, True, True, True,
                       False, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       False, False, False, True, True, True]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(os.path.join(this_directory, "components",
                                    "data_preprocessing", "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.cross_validation.train_test_split(X, y)

        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            cls.fit(X_train, Y_train,
                    init_params={'one_hot_encoding:categorical_features': categorical})
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0] or \
                    "removed all features" in e.args[0] or \
                    "all features are discarded" in e.args[0]:
                continue
            else:
                print(config)
                traceback.print_tb(sys.exc_info()[2])
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
def _test_configurations(self, configurations_space, make_sparse=False,
                         data=None, init_params=None,
                         dataset_properties=None):
    # Use a limit of ~3GiB
    limit = 3072 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    print(configurations_space)

    for i in range(10):
        config = configurations_space.sample_configuration()
        config._populate_values()

        # Restrict configurations which could take too long on travis-ci
        restrictions = {
            'classifier:passive_aggressive:n_iter': 5,
            'classifier:sgd:n_iter': 5,
            'classifier:adaboost:n_estimators': 50,
            'classifier:adaboost:max_depth': 1,
            'feature_preprocessor:kernel_pca:n_components': 10,
            'feature_preprocessor:kitchen_sinks:n_components': 50,
            'classifier:proj_logit:max_epochs': 1,
            'classifier:libsvm_svc:degree': 2,
            'regressor:libsvm_svr:degree': 2,
            'feature_preprocessor:truncatedSVD:target_dim': 10,
            'feature_preprocessor:polynomial:degree': 2,
            'classifier:lda:n_components': 10,
            'feature_preprocessor:nystroem_sampler:n_components': 50,
            'feature_preprocessor:feature_agglomeration:n_clusters': 2,
            'classifier:gradient_boosting:max_leaf_nodes': 64
        }
        for restrict_parameter in restrictions:
            restrict_to = restrictions[restrict_parameter]
            if restrict_parameter in config and \
                    config[restrict_parameter] is not None:
                config._values[restrict_parameter] = restrict_to

        print(config)

        if data is None:
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset='digits', make_sparse=make_sparse, add_NaNs=True)
        else:
            X_train = data['X_train'].copy()
            Y_train = data['Y_train'].copy()
            X_test = data['X_test'].copy()
            Y_test = data['Y_test'].copy()

        init_params_ = copy.deepcopy(init_params)
        cls = SimpleClassificationPipeline(
            random_state=1, dataset_properties=dataset_properties,
            init_params=init_params_)
        cls.set_hyperparameters(config, init_params=init_params_)

        # First make sure that for this configuration, setting the parameters
        # does not mistakenly set the estimator as fitted
        for name, step in cls.named_steps.items():
            with self.assertRaisesRegex(sklearn.exceptions.NotFittedError,
                                        "instance is not fitted yet"):
                check_is_fitted(step)

        try:
            cls.fit(X_train, Y_train)

            # After fit, all components should be tagged as fitted by
            # sklearn. check_is_fitted raises an exception if that is not
            # the case
            try:
                for name, step in cls.named_steps.items():
                    check_is_fitted(step)
            except sklearn.exceptions.NotFittedError:
                self.fail("config={} raised NotFittedError "
                          "unexpectedly!".format(config))

            cls.predict(X_test.copy())
            cls.predict_proba(X_test)
        except MemoryError:
            continue
        except np.linalg.LinAlgError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif 'Bug in scikit-learn' in e.args[0]:
                continue
            elif 'The condensed distance matrix must contain only finite ' \
                 'values.' in e.args[0]:
                continue
            elif 'Internal work array size computation failed' in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "invalid value encountered in multiply" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
def test_multilabel(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    dataset_properties = {"multilabel": True}
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)
    print(cs)
    cs.seed(5)

    for i in range(10):
        X, Y = sklearn.datasets.make_multilabel_classification(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1,
        )
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, :]

        config = cs.sample_configuration()
        if ("classifier:passive_aggressive:n_iter" in config and
                config["classifier:passive_aggressive:n_iter"] is not None):
            config._values["classifier:passive_aggressive:n_iter"] = 5
        if ("classifier:sgd:n_iter" in config and
                config["classifier:sgd:n_iter"] is not None):
            config._values["classifier:sgd:n_iter"] = 5
        if ("classifier:adaboost:n_estimators" in config and
                config["classifier:adaboost:n_estimators"] is not None):
            config._values["classifier:adaboost:n_estimators"] = 50
        if ("classifier:adaboost:max_depth" in config and
                config["classifier:adaboost:max_depth"] is not None):
            config._values["classifier:adaboost:max_depth"] = 1

        cls = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            [self.assertIsInstance(i, np.ndarray)
             for i in predicted_probabilities]
        except np.linalg.LinAlgError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif "Bug in scikit-learn" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue
def _test_configurations(self, configurations_space, make_sparse=False,
                         data=None, init_params=None,
                         dataset_properties=None):
    # Use a limit of ~3GiB
    limit = 3072 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    for i in range(10):
        config = configurations_space.sample_configuration()
        config._populate_values()

        # Restrict configurations which could take too long on travis-ci
        restrictions = {
            'classifier:passive_aggressive:n_iter': 5,
            'classifier:sgd:n_iter': 5,
            'classifier:adaboost:n_estimators': 50,
            'classifier:adaboost:max_depth': 1,
            'preprocessor:kernel_pca:n_components': 10,
            'preprocessor:kitchen_sinks:n_components': 50,
            'classifier:proj_logit:max_epochs': 1,
            'classifier:libsvm_svc:degree': 2,
            'regressor:libsvm_svr:degree': 2,
            'preprocessor:truncatedSVD:target_dim': 10,
            'preprocessor:polynomial:degree': 2,
            'classifier:lda:n_components': 10,
            'preprocessor:nystroem_sampler:n_components': 50,
            'preprocessor:feature_agglomeration:n_clusters': 2
        }
        for restrict_parameter in restrictions:
            restrict_to = restrictions[restrict_parameter]
            if restrict_parameter in config and \
                    config[restrict_parameter] is not None:
                config._values[restrict_parameter] = restrict_to

        if data is None:
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset='digits', make_sparse=make_sparse, add_NaNs=True)
        else:
            X_train = data['X_train'].copy()
            Y_train = data['Y_train'].copy()
            X_test = data['X_test'].copy()
            Y_test = data['Y_test'].copy()

        init_params_ = copy.deepcopy(init_params)
        cls = SimpleClassificationPipeline(
            random_state=1, dataset_properties=dataset_properties,
            init_params=init_params_)
        cls.set_hyperparameters(config)

        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test)
        except MemoryError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif 'Bug in scikit-learn' in e.args[0]:
                continue
            elif 'The condensed distance matrix must contain only finite ' \
                 'values.' in e.args[0]:
                continue
            elif 'which is larger than the original space with n_features=' \
                    in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
def test_weighting_effect(self):
    data = sklearn.datasets.make_classification(
        n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    for name, clf, acc_no_weighting, acc_weighting in \
            [('adaboost', AdaboostClassifier, 0.709, 0.658),
             ('decision_tree', DecisionTree, 0.683, 0.701),
             ('extra_trees', ExtraTreesClassifier, 0.812, 0.8),
             ('gradient_boosting', GradientBoostingClassifier,
              0.800, 0.760),
             ('random_forest', RandomForest, 0.846, 0.792),
             ('libsvm_svc', LibSVM_SVC, 0.571, 0.658),
             ('liblinear_svc', LibLinear_SVC, 0.685, 0.699),
             ('sgd', SGD, 0.65384615384615385, 0.38795986622073581)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            # Fit
            data_ = copy.copy(data)
            X_train = data_[0][:700]
            Y_train = data_[1][:700]
            X_test = data_[0][700:]
            Y_test = data_[1][700:]

            cs = SimpleClassificationPipeline.\
                get_hyperparameter_search_space(
                    include={'classifier': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(default,
                                                      random_state=1)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                acc, sklearn.metrics.f1_score(predictions, Y_test),
                places=3)

            # pre_transform and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:700]
            Y_train = data_[1][:700]
            X_test = data_[0][700:]
            Y_test = data_[1][700:]

            cs = SimpleClassificationPipeline.\
                get_hyperparameter_search_space(
                    include={'classifier': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(default,
                                                      random_state=1)
            Xt, fit_params = classifier.pre_transform(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                acc, sklearn.metrics.f1_score(predictions, Y_test),
                places=3)

    for name, pre, acc_no_weighting, acc_weighting in \
            [('extra_trees_preproc_for_classification',
              ExtraTreesPreprocessorClassification,
              0.7142857142857143, 0.72180451127819545),
             ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
              0.5934065934065933, 0.71111111111111114)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            data_ = copy.copy(data)
            X_train = data_[0][:700]
            Y_train = data_[1][:700]
            X_test = data_[0][700:]
            Y_test = data_[1][700:]

            cs = SimpleClassificationPipeline.\
                get_hyperparameter_search_space(
                    include={'classifier': ['sgd'],
                             'preprocessor': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(default,
                                                      random_state=1)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                acc, sklearn.metrics.f1_score(predictions, Y_test),
                places=3)

            # pre_transform and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:700]
            Y_train = data_[1][:700]
            X_test = data_[0][700:]
            Y_test = data_[1][700:]

            cs = SimpleClassificationPipeline.\
                get_hyperparameter_search_space(
                    include={'classifier': ['sgd'],
                             'preprocessor': [name]})
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(default,
                                                      random_state=1)
            Xt, fit_params = classifier.pre_transform(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                acc, sklearn.metrics.f1_score(predictions, Y_test),
                places=3)
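# A rough, hypothetical illustration of why the 'weighting' strategy above
# changes F1 (this is an assumption about the idea, not auto-sklearn's
# actual balancing code): samples are reweighted inversely to class
# frequency, so on this 80/20 dataset the minority class is up-weighted.
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = np.array([0] * 8 + [1] * 2)           # imbalanced toy labels
w = compute_sample_weight('balanced', y)  # minority samples get weight 2.5
# Each class then contributes equally to the loss: 8 * 0.625 == 2 * 2.5
print(w)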
def test_configurations_sparse(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if ("classifier:passive_aggressive:n_iter" in config and
                config["classifier:passive_aggressive:n_iter"] is not None):
            config._values["classifier:passive_aggressive:n_iter"] = 5
        if ("classifier:sgd:n_iter" in config and
                config["classifier:sgd:n_iter"] is not None):
            config._values["classifier:sgd:n_iter"] = 5
        if ("classifier:adaboost:n_estimators" in config and
                config["classifier:adaboost:n_estimators"] is not None):
            config._values["classifier:adaboost:n_estimators"] = 50
        if ("classifier:adaboost:max_depth" in config and
                config["classifier:adaboost:max_depth"] is not None):
            config._values["classifier:adaboost:max_depth"] = 1

        print(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits",
                                                       make_sparse=True)
        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif "Bug in scikit-learn" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
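# A hedged sketch of what a make_sparse-style helper presumably produces
# (an assumption about get_dataset's behavior, for illustration only):
# zero out a fraction of entries and hand the pipeline a scipy CSR matrix
# instead of a dense array.
import numpy as np
import scipy.sparse
import sklearn.datasets

digits = sklearn.datasets.load_digits()
X, y = digits.data, digits.target
rng = np.random.RandomState(1)
X = X * (rng.rand(*X.shape) > 0.5)   # randomly drop roughly half the entries
X_sparse = scipy.sparse.csr_matrix(X)
print(X_sparse.nnz, "non-zero entries, shape", X_sparse.shape)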
def test_configurations_categorical_data(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties={"sparse": True})
    print(cs)
    for i in range(10):
        config = cs.sample_configuration()
        config._populate_values()
        if ("classifier:passive_aggressive:n_iter" in config and
                config["classifier:passive_aggressive:n_iter"] is not None):
            config._values["classifier:passive_aggressive:n_iter"] = 5
        if ("classifier:sgd:n_iter" in config and
                config["classifier:sgd:n_iter"] is not None):
            config._values["classifier:sgd:n_iter"] = 5
        if ("classifier:adaboost:n_estimators" in config and
                config["classifier:adaboost:n_estimators"] is not None):
            config._values["classifier:adaboost:n_estimators"] = 50
        if ("classifier:adaboost:max_depth" in config and
                config["classifier:adaboost:max_depth"] is not None):
            config._values["classifier:adaboost:max_depth"] = 1

        print(config)
        categorical = [True, True, True, False, False, True, True, True,
                       False, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       True, True, True, True, True, True, True, True,
                       False, False, False, True, True, True]
        this_directory = os.path.dirname(__file__)
        X = np.loadtxt(os.path.join(this_directory, "components",
                                    "data_preprocessing", "dataset.pkl"))
        y = X[:, -1].copy()
        X = X[:, :-1]
        X_train, X_test, Y_train, Y_test = \
            sklearn.cross_validation.train_test_split(X, y)

        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            cls.fit(X_train, Y_train,
                    init_params={"one_hot_encoding:categorical_features":
                                 categorical})
            predictions = cls.predict(X_test)
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif "Bug in scikit-learn" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
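# A small hedged sketch of the kind of component the categorical mask above
# is presumably routed to: a one-hot encoder told, per column, whether the
# column is categorical. Shown with the pre-0.20 scikit-learn
# OneHotEncoder(categorical_features=...) API, which matches the
# sklearn.cross_validation era of this test; names and data are
# illustrative only.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0, 3],
              [1, 5],
              [2, 7]])
mask = [True, False]             # column 0 categorical, column 1 numeric
encoder = OneHotEncoder(categorical_features=mask, sparse=False)
# Three one-hot columns for column 0, column 1 passed through unchanged.
print(encoder.fit_transform(X))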
def _test_configurations(self, configurations_space, make_sparse=False,
                         data=None, init_params=None,
                         dataset_properties=None):
    # Use a limit of ~3GiB
    limit = 3072 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    print(configurations_space)

    for i in range(10):
        config = configurations_space.sample_configuration()
        config._populate_values()

        # Restrict configurations which could take too long on travis-ci
        restrictions = {'classifier:passive_aggressive:n_iter': 5,
                        'classifier:sgd:n_iter': 5,
                        'classifier:adaboost:n_estimators': 50,
                        'classifier:adaboost:max_depth': 1,
                        'preprocessor:kernel_pca:n_components': 10,
                        'preprocessor:kitchen_sinks:n_components': 50,
                        'classifier:proj_logit:max_epochs': 1,
                        'classifier:libsvm_svc:degree': 2,
                        'regressor:libsvm_svr:degree': 2,
                        'preprocessor:truncatedSVD:target_dim': 10,
                        'preprocessor:polynomial:degree': 2,
                        'classifier:lda:n_components': 10,
                        'preprocessor:nystroem_sampler:n_components': 50,
                        'preprocessor:feature_agglomeration:n_clusters': 2,
                        'classifier:gradient_boosting:max_depth': 2,
                        'classifier:gradient_boosting:n_estimators': 50}

        for restrict_parameter in restrictions:
            restrict_to = restrictions[restrict_parameter]
            if restrict_parameter in config and \
                    config[restrict_parameter] is not None:
                config._values[restrict_parameter] = restrict_to

        print(config)

        if data is None:
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset='digits', make_sparse=make_sparse, add_NaNs=True)
        else:
            X_train = data['X_train'].copy()
            Y_train = data['Y_train'].copy()
            X_test = data['X_test'].copy()
            Y_test = data['Y_test'].copy()

        init_params_ = copy.deepcopy(init_params)
        cls = SimpleClassificationPipeline(
            random_state=1, dataset_properties=dataset_properties,
            init_params=init_params_)
        cls.set_hyperparameters(config, init_params=init_params_)
        try:
            cls.fit(X_train, Y_train)
            predictions = cls.predict(X_test.copy())
            predictions = cls.predict_proba(X_test)
        except MemoryError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif 'Bug in scikit-learn' in e.args[0]:
                continue
            elif 'The condensed distance matrix must contain only finite ' \
                 'values.' in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(traceback.format_exc())
                print(config)
                raise e
def test_multilabel(self):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    dataset_properties = {'multilabel': True}
    cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
        dataset_properties=dataset_properties)

    print(cs)
    cs.seed(5)

    for i in range(10):
        X, Y = sklearn.datasets.make_multilabel_classification(
            n_samples=150, n_features=20, n_classes=5, n_labels=2,
            length=50, allow_unlabeled=True, sparse=False,
            return_indicator=True, return_distributions=False,
            random_state=1)
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[100:, :]
        Y_test = Y[100:, :]

        config = cs.sample_configuration()

        if 'classifier:passive_aggressive:n_iter' in config and \
                config['classifier:passive_aggressive:n_iter'] is not None:
            config._values['classifier:passive_aggressive:n_iter'] = 5
        if 'classifier:sgd:n_iter' in config and \
                config['classifier:sgd:n_iter'] is not None:
            config._values['classifier:sgd:n_iter'] = 5
        if 'classifier:adaboost:n_estimators' in config and \
                config['classifier:adaboost:n_estimators'] is not None:
            config._values['classifier:adaboost:n_estimators'] = 50
        if 'classifier:adaboost:max_depth' in config and \
                config['classifier:adaboost:max_depth'] is not None:
            config._values['classifier:adaboost:max_depth'] = 1

        cls = SimpleClassificationPipeline(config, random_state=1)
        print(config)
        try:
            cls.fit(X_train, Y_train)
            X_test_ = X_test.copy()
            predictions = cls.predict(X_test)
            self.assertIsInstance(predictions, np.ndarray)
            predicted_probabilities = cls.predict_proba(X_test_)
            for proba in predicted_probabilities:
                self.assertIsInstance(proba, np.ndarray)
        except np.linalg.LinAlgError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif 'Bug in scikit-learn' in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except MemoryError:
            continue
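# Quick illustrative sketch (not part of the test) of the label-indicator
# format the multilabel test relies on: Y is an (n_samples, n_classes)
# binary matrix with one column per class and possibly several 1s per row.
import sklearn.datasets

X, Y = sklearn.datasets.make_multilabel_classification(
    n_samples=5, n_features=20, n_classes=5, n_labels=2, random_state=1)
print(Y)        # e.g. a row like [0 1 0 1 0]: sample belongs to classes 1, 3
print(Y.shape)  # (5, 5)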
def _test_configurations(self, configurations_space, make_sparse=False,
                         data=None, init_params=None):
    # Use a limit of ~4GiB
    limit = 4000 * 1024 * 1024
    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

    print(configurations_space)

    for i in range(10):
        config = configurations_space.sample_configuration()
        config._populate_values()

        # Restrict configurations which could take too long on travis-ci
        restrictions = {'classifier:passive_aggressive:n_iter': 5,
                        'classifier:sgd:n_iter': 5,
                        'classifier:adaboost:n_estimators': 50,
                        'classifier:adaboost:max_depth': 1,
                        'preprocessor:kernel_pca:n_components': 10,
                        'preprocessor:kitchen_sinks:n_components': 50,
                        'preprocessor:gem:N': 5,
                        'classifier:proj_logit:max_epochs': 1,
                        'classifier:libsvm_svc:degree': 2,
                        'regressor:libsvm_svr:degree': 2}

        for restrict_parameter in restrictions:
            restrict_to = restrictions[restrict_parameter]
            if restrict_parameter in config and \
                    config[restrict_parameter] is not None:
                config._values[restrict_parameter] = restrict_to

        print(config)

        if data is None:
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset='digits', make_sparse=make_sparse)
        else:
            X_train = data['X_train'].copy()
            Y_train = data['Y_train'].copy()
            X_test = data['X_test'].copy()
            Y_test = data['Y_test'].copy()

        cls = SimpleClassificationPipeline(config, random_state=1)
        try:
            init_params_ = copy.deepcopy(init_params)
            cls.fit(X_train, Y_train, init_params=init_params_)
            predictions = cls.predict(X_test)
        except MemoryError:
            continue
        except ValueError as e:
            if "Floating-point under-/overflow occurred at epoch" in \
                    e.args[0]:
                continue
            elif "removed all features" in e.args[0]:
                continue
            elif "all features are discarded" in e.args[0]:
                continue
            elif "Numerical problems in QDA" in e.args[0]:
                continue
            elif 'Bug in scikit-learn' in e.args[0]:
                continue
            else:
                print(config)
                print(traceback.format_exc())
                raise e
        except RuntimeWarning as e:
            if "invalid value encountered in sqrt" in e.args[0]:
                continue
            elif "divide by zero encountered in" in e.args[0]:
                continue
            elif "invalid value encountered in divide" in e.args[0]:
                continue
            elif "invalid value encountered in true_divide" in e.args[0]:
                continue
            else:
                print(config)
                raise e
        except UserWarning as e:
            if "FastICA did not converge" in e.args[0]:
                continue
            else:
                print(config)
                raise e
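# Minimal sketch of why init_params is deep-copied on every iteration above
# (an illustrative assumption that the pipeline may mutate the dict it
# receives): with a shared dict, state from one sampled configuration would
# leak into the next.
import copy

init_params = {'one_hot_encoding:categorical_features': [True, False]}
for _ in range(2):
    params = copy.deepcopy(init_params)  # fresh, isolated copy per run
    params.pop('one_hot_encoding:categorical_features')  # consumer mutates
assert init_params == {'one_hot_encoding:categorical_features': [True, False]}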
def test_weighting_effect(self):
    data = sklearn.datasets.make_classification(
        n_samples=200, n_features=10, n_redundant=2, n_informative=2,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    for name, clf, acc_no_weighting, acc_weighting in \
            [('adaboost', AdaboostClassifier, 0.810, 0.735),
             ('decision_tree', DecisionTree, 0.780, 0.643),
             ('extra_trees', ExtraTreesClassifier, 0.75, 0.800),
             ('gradient_boosting', GradientBoostingClassifier,
              0.789, 0.762),
             ('random_forest', RandomForest, 0.75, 0.821),
             ('libsvm_svc', LibSVM_SVC, 0.769, 0.72),
             ('liblinear_svc', LibLinear_SVC, 0.762, 0.735),
             ('sgd', SGD, 0.704, 0.667)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            # Fit
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            include = {'classifier': [name],
                       'preprocessor': ['no_preprocessing']}
            classifier = SimpleClassificationPipeline(
                random_state=1, include=include)
            cs = classifier.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            classifier.set_hyperparameters(configuration=default)
            Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3)

    for name, pre, acc_no_weighting, acc_weighting in \
            [('extra_trees_preproc_for_classification',
              ExtraTreesPreprocessorClassification, 0.691, 0.692),
             ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
              0.692, 0.590)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            include = {'classifier': ['sgd'], 'preprocessor': [name]}
            classifier = SimpleClassificationPipeline(
                random_state=1, include=include)
            cs = classifier.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier.set_hyperparameters(default)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3)
def test_weighting_effect(self):
    data = sklearn.datasets.make_classification(
        n_samples=200, n_features=10, n_redundant=2, n_informative=2,
        n_repeated=2, n_clusters_per_class=2, weights=[0.8, 0.2],
        random_state=1)

    for name, clf, acc_no_weighting, acc_weighting, places in \
            [('adaboost', AdaboostClassifier, 0.810, 0.735, 3),
             ('decision_tree', DecisionTree, 0.780, 0.643, 3),
             ('extra_trees', ExtraTreesClassifier, 0.780, 0.8, 3),
             ('gradient_boosting', GradientBoostingClassifier,
              0.737, 0.684, 3),
             ('random_forest', RandomForest, 0.780, 0.789, 3),
             ('libsvm_svc', LibSVM_SVC, 0.769, 0.72, 3),
             ('liblinear_svc', LibLinear_SVC, 0.762, 0.735, 3),
             ('passive_aggressive', PassiveAggressive, 0.642, 0.449, 3),
             ('sgd', SGD, 0.818, 0.575, 2)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            # Fit
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            include = {'classifier': [name],
                       'preprocessor': ['no_preprocessing']}
            classifier = SimpleClassificationPipeline(
                random_state=1, include=include)
            cs = classifier.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=places, msg=(name, strategy))

            # fit_transformer and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            classifier.set_hyperparameters(configuration=default)
            Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=places)

    for name, pre, acc_no_weighting, acc_weighting in \
            [('extra_trees_preproc_for_classification',
              ExtraTreesPreprocessorClassification, 0.810, 0.563),
             ('liblinear_svc_preprocessor', LibLinear_Preprocessor,
              0.837, 0.567)]:
        for strategy, acc in [('none', acc_no_weighting),
                              ('weighting', acc_weighting)]:
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            include = {'classifier': ['sgd'], 'preprocessor': [name]}
            classifier = SimpleClassificationPipeline(
                random_state=1, include=include)
            cs = classifier.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            default._values['balancing:strategy'] = strategy
            classifier.set_hyperparameters(default)
            predictor = classifier.fit(X_train, Y_train)
            predictions = predictor.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3, msg=(name, strategy))

            # fit_transformer and fit_estimator
            data_ = copy.copy(data)
            X_train = data_[0][:100]
            Y_train = data_[1][:100]
            X_test = data_[0][100:]
            Y_test = data_[1][100:]

            default._values['balancing:strategy'] = strategy
            classifier = SimpleClassificationPipeline(
                default, random_state=1, include=include)
            Xt, fit_params = classifier.fit_transformer(X_train, Y_train)
            classifier.fit_estimator(Xt, Y_train, **fit_params)
            predictions = classifier.predict(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.f1_score(predictions, Y_test), acc,
                places=3)
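# Tiny hedged example of why the expected F1 scores above move so much
# between 'none' and 'weighting': with an 80/20 class skew, F1 is highly
# sensitive to how the minority class is predicted, even when overall
# accuracy barely changes. Toy data, for illustration only.
import numpy as np
import sklearn.metrics

y_true = np.array([0] * 8 + [1] * 2)
y_all_majority = np.zeros(10, dtype=int)      # ignores the minority class
y_with_minority = np.array([0] * 8 + [1, 0])  # recovers one minority sample
print(sklearn.metrics.f1_score(y_true, y_all_majority))   # 0.0
print(sklearn.metrics.f1_score(y_true, y_with_minority))  # 0.666...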