Example #1
    def start_tuner(self, tuner: Tuner, hdl: dict):
        self.logger.debug(
            f"Starting fine-tune task with HDL (Hyperparameter Description Language):\n{hdl}"
        )
        self.logger.debug(f"The Tuner is:\n{tuner}")
        tuner.set_data_manager(self.data_manager)
        tuner.set_random_state(self.random_state)
        tuner.set_hdl(hdl)  # only to obtain the tuner's shps (search space)
        if estimate_config_space_numbers(tuner.shps) == 1:
            self.logger.info(
                "HDL (Hyperparameter Description Language) describes a constant space; "
                "using manual modeling."
            )
            dhp, self.estimator = tuner.evaluator.shp2model(
                tuner.shps.sample_configuration())
            self.estimator.fit(self.data_manager.X_train,
                               self.data_manager.y_train)
            return {"is_manual": True}
        n_jobs = tuner.n_jobs
        run_limits = [math.ceil(tuner.run_limit / n_jobs)] * n_jobs
        is_master_list = [False] * n_jobs
        is_master_list[0] = True
        initial_configs_list = get_chunks(tuner.design_initial_configs(n_jobs),
                                          n_jobs)
        random_states = np.arange(n_jobs) + self.random_state
        if n_jobs > 1 and tuner.search_method != "grid":
            sync_dict = Manager().dict()
            sync_dict["exit_processes"] = tuner.exit_processes
        else:
            sync_dict = None
        self.resource_manager.close_trials_table()
        self.resource_manager.clear_pid_list()
        self.resource_manager.close_redis()
        resource_managers = [
            deepcopy(self.resource_manager) for i in range(n_jobs)
        ]
        tuners = [deepcopy(tuner) for i in range(n_jobs)]
        processes = []
        # TODO: refactor sync_dict
        for tuner, resource_manager, run_limit, initial_configs, is_master, random_state in \
                zip(tuners, resource_managers, run_limits, initial_configs_list, is_master_list, random_states):
            args = (tuner, resource_manager, run_limit, initial_configs,
                    is_master, random_state, sync_dict)
            if n_jobs == 1:
                self.run(*args)
            else:
                p = multiprocessing.Process(target=self.run, args=args)
                processes.append(p)
                p.start()
        for p in processes:
            p.join()

        return {"is_manual": False}
Example #2
    # Standardization
    stdScale = StandardScaler().fit(newfeature)
    newfeaturenorm = stdScale.transform(newfeature)

    # Binning: discretize Label into intervals
    bins = [-9, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 24]
    new_range = pd.cut(df.Label, bins)
    newlabel = np.array(df.Label)
    return newfeaturenorm, newlabel, new_range


x_train, y_train, y_range = data_preprocessing()

tuner = Tuner(
    initial_runs=12,
    run_limit=120,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "num->target": "lightgbm"
    }
)
resource_manager = ResourceManager(os.getcwd() + "/for_hxw_result")
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor, ensemble_builder=False)

autoflow_pipeline.fit(
    X_train=x_train, y_train=y_train, n_jobs=3
)
joblib.dump(autoflow_pipeline, "autoflow_pipeline_for_hxw.bz")
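
The persisted pipeline can be reloaded later with joblib. A minimal sketch (the predict call assumes AutoFlowEstimator exposes a scikit-learn-style interface, which is not shown above):

import joblib

# Reload the fitted pipeline from disk.
autoflow_pipeline = joblib.load("autoflow_pipeline_for_hxw.bz")
# Assuming a scikit-learn-style interface:
# predictions = autoflow_pipeline.predict(x_train)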
Example #3
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=30,
    run_limit=0,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.under_sample.all_knn",
            "balance.under_sample.cluster_centroids",
            "balance.under_sample.condensed_nearest_neighbour",
            "balance.under_sample.edited_nearest_neighbours",
            "balance.under_sample.instance_hardness_threshold",
            "balance.under_sample.near_miss",
            "balance.under_sample.neighbourhood_cleaning_rule",
            "balance.under_sample.one_sided_selection",
Example #4
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=8,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.over_sample.random",
            "balance.over_sample.adasyn",
            "balance.over_sample.borderline_smote",
            "balance.over_sample.kmeans_smote",
            "balance.over_sample.random",
            "balance.over_sample.smote",
            # "balance.over_sample.smotenc",
            "balance.over_sample.svmsmote",
Example #5
                "_name": "<mask>",
                "_select_percent": {
                    "_type": "quniform",
                    "_value": [1, 100, 0.5],
                    "_default": 80
                }
            },
            "selected->target": {
                "_name": "logistic_regression",
                "_vanilla": True
            }
        }),
]

tuners = [
    Tuner(run_limit=-1, search_method="grid", n_jobs=3, debug=True),
    Tuner(run_limit=50,
          initial_runs=10,
          search_method="smac",
          n_jobs=3,
          debug=True),
]
autoflow_pipeline = AutoFlowEstimator(tuners, hdl_constructors)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}

autoflow_pipeline.fit(X_train=df_train,
                      column_descriptions=column_descriptions)
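
The quniform entry above describes a quantized uniform range given as [low, high, step]. A rough sketch of its sampling semantics, assuming the common low/high/q convention (not autoflow's actual sampler):

import random

def sample_quniform(low, high, q):
    # Draw uniformly in [low, high], then snap to the nearest multiple of q.
    x = random.uniform(low, high)
    return round(x / q) * q

# sample_quniform(1, 100, 0.5) -> e.g. 37.5; "_default": 80 is the value
# used before any sampling happens.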
Example #6
import pandas as pd

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../data/QSAR.csv")

hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "num->var": "compress.variance",
        "var->pea": {
            "_name": "compress.pearson",
            "n_jobs": 6
        },
        "pea->target": "logistic_regression"
    })
tuner = Tuner(run_limit=5, initial_runs=12, search_method="smac", n_jobs=1)
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {"id": "Name", "target": "labels"}

autoflow_pipeline.fit(X_train=df, column_descriptions=column_descriptions)
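
As the var->pea edge shows, a DAG_workflow value can be either a plain component name or a dict. In the dict form, "_name" names the component and the remaining keys (here n_jobs=6) are presumably forwarded to it as parameters:

# Two ways to specify a workflow node (sketch of the HDL convention;
# the forwarding of extra keys is an assumption based on this example):
plain_node = "compress.variance"
parameterized_node = {"_name": "compress.pearson", "n_jobs": 6}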
Example #7
from pathlib import Path

import pandas as pd
from sklearn.model_selection import ShuffleSplit

import autoflow
from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner
examples_path = Path(autoflow.__file__).parent.parent / "examples"
df = pd.read_csv(examples_path / "data/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=1,
    run_limit=12,
    n_jobs=1,
    # debug=True
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "highR_nan->nan": "operate.merge",
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "reduce.pca|lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(
    tuner,
    hdl_constructor,
)
column_descriptions = {
Example #8
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": "impute.fill_abnormal",
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": ["encode.cat_boost", "encode.target", "encode.label"],
        "num->target": {"_name": "lightgbm", "_vanilla": True}
    }
)
tuner = Tuner(
    run_limit=-1,
    search_method="grid"
)
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}

autoflow_pipeline.fit(
    X_train=df_train, X_test=df_test, column_descriptions=column_descriptions
)
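
With search_method="grid" and run_limit=-1 the tuner enumerates the whole configuration space. In this example "_vanilla": True appears to pin lightgbm to fixed hyperparameters, so if the encoders contribute no further grid dimensions, the only varying choice is the cat->num encoder and the grid reduces to three trials. A sketch of that enumeration (not autoflow's internal representation):

from itertools import product

encoders = ["encode.cat_boost", "encode.target", "encode.label"]
# One configuration per encoder choice; every other node is fixed.
grid = list(product(encoders))
print(len(grid))  # 3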
Example #9
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=10,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": [],
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
Example #10
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(initial_runs=1, run_limit=12, n_jobs=1)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "reduce.pca|lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}

autoflow_pipeline.fit(X_train=df_train,
                      X_test=df_test,
                      column_descriptions=column_descriptions)
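
The pipe syntax "reduce.pca|lightgbm" on the num->target edge chains a PCA reduction into the learner. Conceptually it corresponds to a scikit-learn pipeline like the following sketch (assuming lightgbm's sklearn wrapper; autoflow's actual wrapping may differ):

from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Conceptual equivalent of the "reduce.pca|lightgbm" edge.
conceptual_pipeline = Pipeline([
    ("reduce.pca", PCA()),
    ("lightgbm", LGBMClassifier()),
])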