Example no. 1
    def test_reindex_columns(self):
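        # Reverse the column order of the held-out X_test and check that scoring
        # still works, i.e. columns are matched by name rather than by position.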
        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split

        from autoflow.core.classifier import AutoFlowClassifier

        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=42)
        pipe = AutoFlowClassifier(
            DAG_workflow={"num->target": [
                "logistic_regression",
            ]},
            initial_runs=1,
            run_limit=1,
            n_jobs=1,
            debug=True,
            search_method="smac",
            random_state=0,
            resource_manager=self.mock_resource_manager)
        pipe.fit(X_train, y_train, X_test, y_test)
        X_test = pipe.data_manager.X_test.data
        X_test = X_test[[f'column_{i}' for i in range(3, -1, -1)]]
        # score = accuracy_score(y_test, y_pred)
        score = pipe.score(X_test, y_test)
        print(score)
        assert score > 0.8
Example no. 2
 def test_should_stack_X(self):
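     # With should_stack_X=False, STACK_X_MSG must never show up in the log output.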
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->scale":
         "scale.standardize",
         "scale->trans":
         "transform.power",
         "trans->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=1,
                               run_limit=1,
                               debug=True,
                               resource_manager=self.mock_resource_manager,
                               should_stack_X=False,
                               log_file=self.log_file)
     pipe.fit(X_train, y_train)
     score = pipe.score(X_test, y_test)
     print(score)
     self.assertGreater(score, 0.5)
     self.update_log_path(pipe)
     for (level, logger, msg) in self.iter_log_items():
         if msg == STACK_X_MSG:
             print((level, logger, msg))
         assert msg != STACK_X_MSG
Example no. 3
 def test_2(self):
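     # Two-stage search (random search, then SMAC) over duplicated HDL constructors,
     # backed by the HTTP resource manager; the final estimator should be a VoteClassifier.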
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=42)
     hdl_constructors = [
         HDL_Constructor(DAG_workflow={
             "num->target": ["linearsvc", "svc", "logistic_regression"]
         }, )
     ] * 2
     tuners = [
         Tuner(search_method="random", run_limit=3, n_jobs=3, debug=True),
         Tuner(search_method="smac",
               initial_runs=3,
               run_limit=6,
               n_jobs=3,
               debug=True)
     ]
     pipe = AutoFlowClassifier(
         hdl_constructor=hdl_constructors,
         tuner=tuners,
         resource_manager=self.http_mock_resource_manager)
     pipe.fit(X_train, y_train, fit_ensemble_params=False)
     assert isinstance(pipe.estimator, VoteClassifier)
     # score = accuracy_score(y_test, y_pred)
     score = pipe.score(X_test, y_test)
     assert score > 0.8
Example no. 4
 def test_single_classifier(self):
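     # Minimal end-to-end run: one initial run and one tuning run over a single
     # num->target workflow with three candidate classifiers.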
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=1,
                               run_limit=1,
                               debug=True,
                               resource_manager=self.mock_resource_manager)
     pipe.fit(X_train, y_train)
     # score = accuracy_score(y_test, y_pred)
     score = pipe.score(X_test, y_test)
     print(score)
     self.assertGreater(score, 0.5)
Example no. 5
 def test_1(self):
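     # With fit_ensemble_params="auto", the fitted estimator should be a StackClassifier.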
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=42)
     pipe = AutoFlowClassifier(
         DAG_workflow={
             "num->target": ["linearsvc", "svc", "logistic_regression"]
         },
         initial_runs=3,
         run_limit=9,
         n_jobs=3,
         debug=True,
         resource_manager=self.http_mock_resource_manager)
     pipe.fit(X_train, y_train, fit_ensemble_params="auto")
     assert isinstance(pipe.estimator, StackClassifier)
     score = pipe.score(X_test, y_test)
     assert score > 0.8
Example no. 6
 def test_single_classifier_with_X_test(self):
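     # Passing X_test / y_test to fit() should make every trial record a non-empty
     # test_all_score dict in the resource manager's trial table.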
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=1,
                               run_limit=1,
                               debug=True,
                               resource_manager=self.mock_resource_manager)
     pipe.fit(X_train, y_train, X_test, y_test)
     y_pred = pipe.predict(X_test)
     # score = accuracy_score(y_test, y_pred)
     score = pipe.score(X_test, y_test)
     print(score)
     self.assertGreater(score, 0.8)
     pipe.resource_manager.init_trial_table()
     trial = pipe.resource_manager.TrialModel
     records = trial.select().where(
         trial.experiment_id == pipe.experiment_id)
     for record in records:
         self.assertTrue(record is not None)
         self.assertTrue(
             isinstance(record.test_all_score, dict)
             and bool(record.test_all_score)
             and record.test_all_score["accuracy"] > 0.8)
     pipe.resource_manager.close_trial_table()
Example no. 7
 def test_dirty_label(self):
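     # String class labels ("apple", "pear", "banana") should be handled through the
     # data manager's label encoder, whose classes_ end up in sorted order.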
     X, y = load_iris(return_X_y=True)
     y = y.astype("str")
     y[y == '0'] = "apple"
     y[y == '1'] = "pear"
     y[y == '2'] = "banana"
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=1,
                               run_limit=1,
                               debug=True,
                               resource_manager=self.mock_resource_manager)
     pipe.fit(X_train, y_train)
     # score = accuracy_score(y_test, y_pred)
     score = pipe.score(X_test, y_test)
     print(score)
     self.assertGreater(score, 0.8)
     self.assertTrue(
         np.all(pipe.data_manager.label_encoder.classes_ == array(
             ['apple', 'banana', 'pear'], dtype=object)))
Example no. 8
 def test_close_all(self):
     # todo: add test cases covering prediction and ensemble learning
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=2,
                               run_limit=2,
                               debug=True,
                               log_file=self.log_file,
                               resource_manager=self.mock_resource_manager)
     pipe.fit(X_train,
              y_train,
              splitter=ShuffleSplit(n_splits=1,
                                    test_size=0.2,
                                    random_state=42),
              fit_ensemble_params=False
              # fixme: ensembling over hold-out validation is not supported yet
              )
     # score = accuracy_score(y_test, y_pred)
     score = pipe.score(X_test, y_test)
     print(score)
     # ---- analyze the log: CONNECTION_POOL_CLOSE_MSG must only appear between
     # safe-close start/end messages ----
     stack_cnt = 0
     self.update_log_path(pipe)
     for (level, logger, msg) in self.iter_log_items():
         if logger == RESOURCE_MANAGER_CLOSE_ALL_LOGGER:
             print("MESSAGE :", msg)
             if msg == START_SAFE_CLOSE_MSG.strip():
                 stack_cnt += 1
             elif msg == END_SAFE_CLOSE_MSG.strip():
                 stack_cnt -= 1
             elif msg == CONNECTION_POOL_CLOSE_MSG.strip():
                 if stack_cnt > 0:
                     pass
                 else:
                     raise Exception  # connection pool close must be fully wrapped by safe-close start/end messages
Example no. 9
 def test_ensemble_classifiers(self):
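     # Stacked ensemble over a 2-split ShuffleSplit: the out-of-fold predictions should
     # cover 46 validation rows (2 splits x 23 samples), with row probabilities summing to ~1.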
     X, y = load_iris(return_X_y=True)
     y = y.astype("str")
     y[y == '0'] = "apple"
     y[y == '1'] = "pear"
     y[y == '2'] = "banana"
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     pipe = AutoFlowClassifier(
         DAG_workflow={
             "num->target": ["linearsvc", "svc", "logistic_regression"]
         },
         initial_runs=2,
         run_limit=2,
         n_jobs=2,
         resource_manager=self.mock_resource_manager,
         debug=True,
     )
     pipe.fit(X_train,
              y_train,
              splitter=ShuffleSplit(n_splits=2,
                                    test_size=0.2,
                                    random_state=42))
     score = pipe.score(X_test, y_test)
     print(score)
     assert pipe.estimator.stacked_y_true.shape == (46, )
     assert np.all(
         pipe.estimator.prediction_list[0].sum(axis=1) - 1 < 0.001)
     assert pipe.estimator.prediction_list[0].shape == (46, 3)
     assert score > 0.8
     for splitter in [
             # LeaveOneOut(),
             ShuffleSplit(n_splits=20, test_size=0.3, random_state=42),
             KFold()
     ]:
         pipe.fit(X_train, y_train, splitter=splitter)
         score = pipe.score(X_test, y_test)
         assert score > 0.8
         print("splitter:", splitter)
         print("test accuracy:", score)
Example no. 10
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier
from autoflow.data_container import DataFrameContainer
from autoflow.data_container import NdArrayContainer

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_test_ = DataFrameContainer(dataset_instance=X_test)
y_test_ = NdArrayContainer(dataset_instance=y_test)
pipe = AutoFlowClassifier()
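# Build an ensemble from trials previously persisted under the given task_id / budget_id
# in the resource manager (the trials fetcher is asked for k=50 trials).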
estimator = pipe.fit_ensemble(
    task_id="2435e32babd7d09b6357e99aa7fa3b89",
    budget_id="afff102b36a43efe4f68e299ff21cadd",
    trials_fetcher_params={"k": 50}
)
# pipe.fit(X_train, y_train, fit_ensemble_params=False)
# score = accuracy_score(y_test, y_pred)
y_pred = estimator.predict(X_test_)
score = accuracy_score(y_test, y_pred)
print(score)
Example no. 11
 def test(self):
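     # Run a search, then rebuild ensembles in fresh AutoFlowClassifier instances from the
     # persisted trials (once via GetSpecificTrials, once via GetBestK), using a data
     # manager copy that no longer holds the raw training data.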
     X, y = load_iris(return_X_y=True)
     y = y.astype("str")
     y[y == '0'] = "apple"
     y[y == '1'] = "pear"
     y[y == '2'] = "banana"
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=42)
     pipe = AutoFlowClassifier(DAG_workflow={
         "num->target": ["linearsvc", "svc", "logistic_regression"]
     },
                               initial_runs=6,
                               run_limit=6,
                               n_jobs=2,
                               debug=True,
                               resource_manager=self.mock_resource_manager)
     pipe.fit(X_train,
              y_train,
              splitter=ShuffleSplit(n_splits=2,
                                    test_size=0.2,
                                    random_state=42),
              fit_ensemble_params=False)
     assert pipe.experiment_id == 1
     data_manager = pipe.data_manager.copy(keep_data=False)
     assert data_manager.X_train is None
     assert pipe.data_manager.X_train is not None
     #######################################################
     ensemble_pipe1 = AutoFlowClassifier(
         resource_manager=self.mock_resource_manager)
     data_manager = deepcopy(data_manager)
     data_manager.resource_manager = ensemble_pipe1.resource_manager
     assert data_manager.X_train is None
     ensemble_pipe1.data_manager = data_manager
     ensemble_pipe1.estimator = ensemble_pipe1.fit_ensemble(
         task_id=pipe.task_id,
         trials_fetcher="GetSpecificTrials",
         trials_fetcher_params={"trial_ids": [0, 1, 2, 3, 4]})
     assert ensemble_pipe1.experiment_id == 2
     score = ensemble_pipe1.score(X_test, y_test)
     assert score > 0.8
     assert len(ensemble_pipe1.estimator.estimators_list) == 4
     #######################################################
     ensemble_pipe2 = AutoFlowClassifier(
         resource_manager=self.mock_resource_manager)
     data_manager = deepcopy(data_manager)
     data_manager.resource_manager = ensemble_pipe2.resource_manager
     assert data_manager.X_train is None
     ensemble_pipe2.data_manager = data_manager
     ensemble_pipe2.estimator = ensemble_pipe2.fit_ensemble(
         task_id=pipe.task_id,
         trials_fetcher="GetBestK",
         trials_fetcher_params={"k": 5})
     assert ensemble_pipe2.experiment_id == 3
     score = ensemble_pipe2.score(X_test, y_test)
     assert score > 0.8
     assert len(ensemble_pipe2.estimator.estimators_list) == 5
Example no. 12
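# Configuration-only snippet: "ET" config generator with local search enabled,
# successive halving only (SH_only=True), and a fixed budget of 1/16.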
pipe = AutoFlowClassifier(
    DAG_workflow={
        "num->target": [
            "linearsvc",
            "svc",
            "logistic_regression",
            "random_forest",
            # "catboost",
        ]
    },
    config_generator="ET",
    config_generator_params={
        # "acq_func": "EI",
        # "xi": 0,
        # "loss_transformer":None,
        # "bw_method": "scott",
        # "n_samples": 5000,
        "min_points_in_model": 50,
        "use_local_search": True,
        # "use_thompson_sampling":False,
        # "kde_sample_weight_scaler": None
    },
    warm_start=False,
    random_state=0,
    min_n_samples_for_SH=50,
    concurrent_type="thread",
    # max_budget=1,
    n_jobs_in_algorithm=3,
    n_workers=1,
    SH_only=True,
    min_budget=1 / 16,
    max_budget=1 / 16,
    n_iterations=100,
    # min_budget=1 / 4,
    debug_evaluator=True,
)
Example no. 13
# (construction of http_resource_manager is truncated in the original snippet;
#  it was configured with JSON 'Content-Type' / 'accept' headers)
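# Two-stage search (random search, then SMAC) over duplicated HDL constructors against
# an HTTP-backed resource manager; the final estimator is expected to be a VoteClassifier.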
hdl_constructors = [
    HDL_Constructor(DAG_workflow={
        "num->target": ["linearsvc", "svc", "logistic_regression"]
    }, )
] * 2
tuners = [
    Tuner(search_method="random", run_limit=3, n_jobs=3, debug=True),
    Tuner(search_method="smac",
          initial_runs=3,
          run_limit=6,
          n_jobs=3,
          debug=True)
]
pipe = AutoFlowClassifier(hdl_constructor=hdl_constructors,
                          tuner=tuners,
                          resource_manager=http_resource_manager)
pipe.fit(
    X_train,
    y_train,
    # fit_ensemble_params="auto",
    fit_ensemble_params=False,
)
assert isinstance(pipe.estimator, VoteClassifier)
# score = accuracy_score(y_test, y_pred)
score = pipe.score(X_test, y_test)
assert score > 0.8
Example no. 14
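# Configuration fragment: min_points_in_model, n_workers, n_iterations and
# initial_points are variables defined elsewhere in the original script.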
pipe = AutoFlowClassifier(
    imbalance_threshold=1,
    should_record_workflow_step=False,
    db_type="postgresql",
    db_params={
        "user": "******",
        "host": "0.0.0.0",
        "port": 5432
    },
    search_record_db_name="autoflow_test",
    config_generator="ET",
    config_generator_params={
        # "acq_func": "EI",
        # "xi": 0,
        # "loss_transformer":None,
        # "bw_method": "scott",
        # "n_samples": 5000,
        "min_points_in_model": min_points_in_model,
        "use_local_search": False,
        "use_thompson_sampling": False,
        # "kde_sample_weight_scaler": None
    },
    n_folds=3,
    warm_start=False,
    random_state=0,
    min_n_samples_for_SH=50,
    concurrent_type="process",
    n_workers=n_workers,
    SH_only=True,
    min_budget=4,
    max_budget=4,
    n_iterations=n_iterations,
    debug_evaluator=True,
    initial_points=initial_points)
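# A second, self-contained snippet: a two-step standardize -> linearsvc workflow
# (both steps marked "_vanilla") evaluated on iris with the SMAC search method.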
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier

X, y = load_iris(return_X_y=True)
# X = X[y != 2]
# y = y[y != 2]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = AutoFlowClassifier(DAG_workflow={
    "num->scaled": {
        "_name": "scale.standardize",
        "_vanilla": True
    },
    "scaled->target": {
        "_name": "linearsvc",
        "random_state": 42,
        "_vanilla": True
    }
},
                          initial_runs=3,
                          run_limit=9,
                          n_jobs=3,
                          debug=True,
                          search_method="smac",
                          random_state=0)
pipe.fit(X_train, y_train, fit_ensemble_params=False)
# score = accuracy_score(y_test, y_pred)
score = pipe.score(X_test, y_test)
print(score)