def test_reindex_columns(self):
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from autoflow.core.classifier import AutoFlowClassifier

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    pipe = AutoFlowClassifier(
        DAG_workflow={"num->target": [
            "logistic_regression",
        ]},
        initial_runs=1,
        run_limit=1,
        n_jobs=1,
        debug=True,
        search_method="smac",
        random_state=0,
        resource_manager=self.mock_resource_manager)
    pipe.fit(X_train, y_train, X_test, y_test)
    # Reverse the column order of the held-out test set to verify that
    # scoring reindexes columns by name instead of relying on position.
    X_test = pipe.data_manager.X_test.data
    X_test = X_test[[f'column_{i}' for i in range(3, -1, -1)]]
    score = pipe.score(X_test, y_test)
    print(score)
    assert score > 0.8
def test_should_stack_X(self):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->scale": "scale.standardize",
            "scale->trans": "transform.power",
            "trans->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=1,
        run_limit=1,
        debug=True,
        resource_manager=self.mock_resource_manager,
        should_stack_X=False,
        log_file=self.log_file)
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    print(score)
    self.assertGreater(score, 0.5)
    # With should_stack_X=False, the stacking message must never appear in the log.
    self.update_log_path(pipe)
    for (level, logger, msg) in self.iter_log_items():
        if msg == STACK_X_MSG:
            print((level, logger, msg))
        assert msg != STACK_X_MSG
def test_2(self):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    hdl_constructors = [
        HDL_Constructor(DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        })
    ] * 2
    tuners = [
        Tuner(search_method="random", run_limit=3, n_jobs=3, debug=True),
        Tuner(search_method="smac", initial_runs=3, run_limit=6, n_jobs=3, debug=True)
    ]
    pipe = AutoFlowClassifier(
        hdl_constructor=hdl_constructors,
        tuner=tuners,
        resource_manager=self.http_mock_resource_manager)
    pipe.fit(X_train, y_train, fit_ensemble_params=False)
    assert isinstance(pipe.estimator, VoteClassifier)
    score = pipe.score(X_test, y_test)
    assert score > 0.8
def test_single_classifier(self):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=1,
        run_limit=1,
        debug=True,
        resource_manager=self.mock_resource_manager)
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    print(score)
    self.assertGreater(score, 0.5)
def test_1(self):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=3,
        run_limit=9,
        n_jobs=3,
        debug=True,
        resource_manager=self.http_mock_resource_manager)
    pipe.fit(X_train, y_train, fit_ensemble_params="auto")
    assert isinstance(pipe.estimator, StackClassifier)
    score = pipe.score(X_test, y_test)
    assert score > 0.8
def test_single_classifier_with_X_test(self):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=1,
        run_limit=1,
        debug=True,
        resource_manager=self.mock_resource_manager)
    pipe.fit(X_train, y_train, X_test, y_test)
    y_pred = pipe.predict(X_test)
    score = pipe.score(X_test, y_test)
    print(score)
    self.assertGreater(score, 0.8)
    # Every trial of this experiment should have recorded non-empty test-set scores.
    pipe.resource_manager.init_trial_table()
    trial = pipe.resource_manager.TrialModel
    records = trial.select().where(trial.experiment_id == pipe.experiment_id)
    for record in records:
        self.assertTrue(record is not None)
        self.assertTrue(
            isinstance(record.test_all_score, dict) and
            bool(record.test_all_score) and
            record.test_all_score["accuracy"] > 0.8)
    pipe.resource_manager.close_trial_table()
def test_dirty_label(self):
    X, y = load_iris(return_X_y=True)
    # Replace the numeric labels with string labels to exercise label encoding.
    y = y.astype("str")
    y[y == '0'] = "apple"
    y[y == '1'] = "pear"
    y[y == '2'] = "banana"
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=1,
        run_limit=1,
        debug=True,
        resource_manager=self.mock_resource_manager)
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    print(score)
    self.assertGreater(score, 0.8)
    self.assertTrue(
        np.all(pipe.data_manager.label_encoder.classes_ ==
               array(['apple', 'banana', 'pear'], dtype=object)))
def test_close_all(self):
    # todo: add cases for prediction and ensemble learning
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=2,
        run_limit=2,
        debug=True,
        log_file=self.log_file,
        resource_manager=self.mock_resource_manager)
    pipe.fit(
        X_train, y_train,
        splitter=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
        fit_ensemble_params=False  # fixme: ensembling over hold-out validation is not supported yet
    )
    score = pipe.score(X_test, y_test)
    print(score)
    # ----analyzing-----
    # Every connection-pool close must be wrapped by a safe-close begin/end pair.
    stack_cnt = 0
    self.update_log_path(pipe)
    for (level, logger, msg) in self.iter_log_items():
        if logger == RESOURCE_MANAGER_CLOSE_ALL_LOGGER:
            print("MESSAGE :", msg)
            if msg == START_SAFE_CLOSE_MSG.strip():
                stack_cnt += 1
            elif msg == END_SAFE_CLOSE_MSG.strip():
                stack_cnt -= 1
            elif msg == CONNECTION_POOL_CLOSE_MSG.strip():
                if stack_cnt > 0:
                    pass
                else:
                    raise Exception  # close must be completely wrapped
def test_ensemble_classifiers(self):
    X, y = load_iris(return_X_y=True)
    y = y.astype("str")
    y[y == '0'] = "apple"
    y[y == '1'] = "pear"
    y[y == '2'] = "banana"
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=2,
        run_limit=2,
        n_jobs=2,
        resource_manager=self.mock_resource_manager,
        debug=True,
    )
    pipe.fit(X_train, y_train,
             splitter=ShuffleSplit(n_splits=2, test_size=0.2, random_state=42))
    score = pipe.score(X_test, y_test)
    print(score)
    # Two shuffle splits of 23 held-out samples each yield 46 stacked predictions,
    # and each predicted probability row should sum to 1.
    assert pipe.estimator.stacked_y_true.shape == (46,)
    assert np.all(pipe.estimator.prediction_list[0].sum(axis=1) - 1 < 0.001)
    assert pipe.estimator.prediction_list[0].shape == (46, 3)
    assert score > 0.8
    for splitter in [
        # LeaveOneOut(),
        ShuffleSplit(n_splits=20, test_size=0.3, random_state=42),
        KFold()
    ]:
        pipe.fit(X_train, y_train, splitter=splitter)
        score = pipe.score(X_test, y_test)
        assert score > 0.8
        print("splitter:", splitter)
        print("test accuracy:", score)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact : [email protected]
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier
from autoflow.data_container import DataFrameContainer
from autoflow.data_container import NdArrayContainer

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_test_ = DataFrameContainer(dataset_instance=X_test)
y_test_ = NdArrayContainer(dataset_instance=y_test)
pipe = AutoFlowClassifier()
# Build an ensemble from the best 50 trials of an existing task instead of refitting.
estimator = pipe.fit_ensemble(
    task_id="2435e32babd7d09b6357e99aa7fa3b89",
    budget_id="afff102b36a43efe4f68e299ff21cadd",
    trials_fetcher_params={"k": 50}
)
# pipe.fit(X_train, y_train, fit_ensemble_params=False)
y_pred = estimator.predict(X_test_)
score = accuracy_score(y_test, y_pred)
print(score)
def test(self):
    X, y = load_iris(return_X_y=True)
    y = y.astype("str")
    y[y == '0'] = "apple"
    y[y == '1'] = "pear"
    y[y == '2'] = "banana"
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    pipe = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=6,
        run_limit=6,
        n_jobs=2,
        debug=True,
        resource_manager=self.mock_resource_manager)
    pipe.fit(X_train, y_train,
             splitter=ShuffleSplit(n_splits=2, test_size=0.2, random_state=42),
             fit_ensemble_params=False)
    assert pipe.experiment_id == 1
    # copy(keep_data=False) should drop the data while keeping the metadata.
    data_manager = pipe.data_manager.copy(keep_data=False)
    assert data_manager.X_train is None
    assert pipe.data_manager.X_train is not None
    #######################################################
    ensemble_pipe1 = AutoFlowClassifier(resource_manager=self.mock_resource_manager)
    data_manager = deepcopy(data_manager)
    data_manager.resource_manager = ensemble_pipe1.resource_manager
    assert data_manager.X_train is None
    ensemble_pipe1.data_manager = data_manager
    ensemble_pipe1.estimator = ensemble_pipe1.fit_ensemble(
        task_id=pipe.task_id,
        trials_fetcher="GetSpecificTrials",
        trials_fetcher_params={"trial_ids": [0, 1, 2, 3, 4]})
    assert ensemble_pipe1.experiment_id == 2
    score = ensemble_pipe1.score(X_test, y_test)
    assert score > 0.8
    assert len(ensemble_pipe1.estimator.estimators_list) == 4
    #######################################################
    ensemble_pipe2 = AutoFlowClassifier(resource_manager=self.mock_resource_manager)
    data_manager = deepcopy(data_manager)
    data_manager.resource_manager = ensemble_pipe2.resource_manager
    assert data_manager.X_train is None
    ensemble_pipe2.data_manager = data_manager
    ensemble_pipe2.estimator = ensemble_pipe2.fit_ensemble(
        task_id=pipe.task_id,
        trials_fetcher="GetBestK",
        trials_fetcher_params={"k": 5})
    assert ensemble_pipe2.experiment_id == 3
    score = ensemble_pipe2.score(X_test, y_test)
    assert score > 0.8
    assert len(ensemble_pipe2.estimator.estimators_list) == 5
pipe = AutoFlowClassifier(
    DAG_workflow={
        "num->target": [
            "linearsvc",
            "svc",
            "logistic_regression",
            "random_forest",
            # "catboost",
        ]
    },
    config_generator="ET",
    config_generator_params={
        # "acq_func": "EI",
        # "xi": 0,
        # "loss_transformer": None,
        # "bw_method": "scott",
        # "n_samples": 5000,
        "min_points_in_model": 50,
        "use_local_search": True,
        # "use_thompson_sampling": False,
        # "kde_sample_weight_scaler": None
    },
    warm_start=False,
    random_state=0,
    min_n_samples_for_SH=50,
    concurrent_type="thread",
    # max_budget=1,
    n_jobs_in_algorithm=3,
    n_workers=1,
    SH_only=True,
    min_budget=1 / 16,
    max_budget=1 / 16,
    n_iterations=100,
    # min_budget=1 / 4,
    debug_evaluator=True,
)
        'Content-Type': 'application/json',
        'accept': 'application/json',
    }
})
hdl_constructors = [
    HDL_Constructor(DAG_workflow={
        "num->target": ["linearsvc", "svc", "logistic_regression"]
    })
] * 2
tuners = [
    Tuner(search_method="random", run_limit=3, n_jobs=3, debug=True),
    Tuner(search_method="smac", initial_runs=3, run_limit=6, n_jobs=3, debug=True)
]
pipe = AutoFlowClassifier(hdl_constructor=hdl_constructors,
                          tuner=tuners,
                          resource_manager=http_resource_manager)
pipe.fit(
    X_train, y_train,
    # fit_ensemble_params="auto",
    fit_ensemble_params=False,
)
assert isinstance(pipe.estimator, VoteClassifier)
score = pipe.score(X_test, y_test)
assert score > 0.8
pipe = AutoFlowClassifier(
    imbalance_threshold=1,
    should_record_workflow_step=False,
    db_type="postgresql",
    db_params={
        "user": "******",
        "host": "0.0.0.0",
        "port": 5432
    },
    search_record_db_name="autoflow_test",
    config_generator="ET",
    config_generator_params={
        # "acq_func": "EI",
        # "xi": 0,
        # "loss_transformer": None,
        # "bw_method": "scott",
        # "n_samples": 5000,
        "min_points_in_model": min_points_in_model,
        "use_local_search": False,
        "use_thompson_sampling": False,
        # "kde_sample_weight_scaler": None
    },
    n_folds=3,
    warm_start=False,
    random_state=0,
    min_n_samples_for_SH=50,
    concurrent_type="process",
    n_workers=n_workers,
    SH_only=True,
    min_budget=4,
    max_budget=4,
    n_iterations=n_iterations,
    debug_evaluator=True,
    initial_points=initial_points)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier

X, y = load_iris(return_X_y=True)
# X = X[y != 2]
# y = y[y != 2]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = AutoFlowClassifier(
    DAG_workflow={
        "num->scaled": {
            "_name": "scale.standardize",
            "_vanilla": True
        },
        "scaled->target": {
            "_name": "linearsvc",
            "random_state": 42,
            "_vanilla": True
        }
    },
    initial_runs=3,
    run_limit=9,
    n_jobs=3,
    debug=True,
    search_method="smac",
    random_state=0)
pipe.fit(X_train, y_train, fit_ensemble_params=False)
score = pipe.score(X_test, y_test)
print(score)