def start_tuner(self, tuner: Tuner, hdl: dict):
    self.logger.debug(
        f"Start fine tune task, \nwhich HDL(Hyperparams Descriptions Language) is:\n{hdl}"
    )
    self.logger.debug(f"which Tuner is:\n{tuner}")
    tuner.set_data_manager(self.data_manager)
    tuner.set_random_state(self.random_state)
    tuner.set_hdl(hdl)  # just to obtain the tuner's shps
    if estimate_config_space_numbers(tuner.shps) == 1:
        self.logger.info(
            "HDL(Hyperparams Descriptions Language) is a constant space, using manual modeling."
        )
        dhp, self.estimator = tuner.evaluator.shp2model(
            tuner.shps.sample_configuration())
        self.estimator.fit(self.data_manager.X_train, self.data_manager.y_train)
        return {"is_manual": True}
    n_jobs = tuner.n_jobs
    run_limits = [math.ceil(tuner.run_limit / n_jobs)] * n_jobs
    is_master_list = [False] * n_jobs
    is_master_list[0] = True
    initial_configs_list = get_chunks(tuner.design_initial_configs(n_jobs), n_jobs)
    random_states = np.arange(n_jobs) + self.random_state
    if n_jobs > 1 and tuner.search_method != "grid":
        sync_dict = Manager().dict()
        sync_dict["exit_processes"] = tuner.exit_processes
    else:
        sync_dict = None
    self.resource_manager.close_trials_table()
    self.resource_manager.clear_pid_list()
    self.resource_manager.close_redis()
    resource_managers = [
        deepcopy(self.resource_manager) for i in range(n_jobs)
    ]
    tuners = [deepcopy(tuner) for i in range(n_jobs)]
    processes = []
    # todo: refactor sync_dict
    for tuner, resource_manager, run_limit, initial_configs, is_master, random_state in \
            zip(tuners, resource_managers, run_limits, initial_configs_list, is_master_list, random_states):
        args = (tuner, resource_manager, run_limit, initial_configs, is_master, random_state, sync_dict)
        if n_jobs == 1:
            self.run(*args)
        else:
            p = multiprocessing.Process(target=self.run, args=args)
            processes.append(p)
            p.start()
    for p in processes:
        p.join()
    return {"is_manual": False}
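# NOTE: a minimal sketch of the chunking helper assumed above. The real get_chunks
# is defined elsewhere in the code base; this illustrative version only shows the
# behavior the dispatcher relies on: splitting the designed initial configurations
# into n_jobs roughly equal, contiguous groups, one per worker process.
import math

def get_chunks_sketch(items, n_chunks):
    """Split `items` into `n_chunks` roughly equal, contiguous chunks (hypothetical helper)."""
    chunk_size = math.ceil(len(items) / n_chunks) if items else 0
    return [items[i * chunk_size:(i + 1) * chunk_size] for i in range(n_chunks)]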
    # standardize the features
    stdScale = StandardScaler().fit(newfeature)
    newfeaturenorm = stdScale.transform(newfeature)
    # bin the labels into intervals
    bins = [-9, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 24]
    new_range = pd.cut(df.Label, bins)
    newlabel = np.array(df.Label)
    return newfeaturenorm, newlabel, new_range


x_train, y_train, y_range = data_preprocessing()
tuner = Tuner(
    initial_runs=12,
    run_limit=120,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "num->target": "lightgbm"
    }
)
resource_manager = ResourceManager(os.getcwd() + "/for_hxw_result")
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor, ensemble_builder=False)
autoflow_pipeline.fit(
    X_train=x_train, y_train=y_train, n_jobs=3
)
joblib.dump(autoflow_pipeline, "autoflow_pipeline_for_hxw.bz")
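# NOTE: a minimal sketch of how the dumped pipeline could be reloaded and reused
# later. It assumes AutoFlowEstimator exposes a sklearn-style predict() and that
# the new samples have the same feature layout as x_train; adjust to the actual API.
import joblib

loaded_pipeline = joblib.load("autoflow_pipeline_for_hxw.bz")
# predictions = loaded_pipeline.predict(x_new)  # x_new: new samples, hypothetical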
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=30,
    run_limit=0,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.under_sample.all_knn",
            "balance.under_sample.cluster_centroids",
            "balance.under_sample.condensed_nearest_neighbour",
            "balance.under_sample.edited_nearest_neighbours",
            "balance.under_sample.instance_hardness_threshold",
            "balance.under_sample.near_miss",
            "balance.under_sample.neighbourhood_cleaning_rule",
            "balance.under_sample.one_sided_selection",
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=8,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.over_sample.random",
            "balance.over_sample.adasyn",
            "balance.over_sample.borderline_smote",
            "balance.over_sample.kmeans_smote",
            "balance.over_sample.random",
            "balance.over_sample.smote",
            # "balance.over_sample.smotenc",
            "balance.over_sample.svmsmote",
"_name": "<mask>", "_select_percent": { "_type": "quniform", "_value": [1, 100, 0.5], "_default": 80 } }, "selected->target": { "_name": "logistic_regression", "_vanilla": True } }), ] tuners = [ Tuner(run_limit=-1, search_method="grid", n_jobs=3, debug=True), Tuner(run_limit=50, initial_runs=10, search_method="smac", n_jobs=3, debug=True), ] autoflow_pipeline = AutoFlowEstimator(tuners, hdl_constructors) column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } autoflow_pipeline.fit(X_train=df_train, column_descriptions=column_descriptions)
import pandas as pd

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../data/QSAR.csv")

hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "num->var": "compress.variance",
        "var->pea": {
            "_name": "compress.pearson",
            "n_jobs": 6
        },
        "pea->target": "logistic_regression"
    })
tuner = Tuner(run_limit=5, initial_runs=12, search_method="smac", n_jobs=1)
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {"id": "Name", "target": "labels"}
autoflow_pipeline.fit(X_train=df, column_descriptions=column_descriptions)
from pathlib import Path

import pandas as pd
from sklearn.model_selection import ShuffleSplit

import autoflow
from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

examples_path = Path(autoflow.__file__).parent.parent / "examples"
df = pd.read_csv(examples_path / "data/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=1,
    run_limit=12,
    n_jobs=1,
    # debug=True
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "highR_nan->nan": "operate.merge",
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "reduce.pca|lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(
    tuner,
    hdl_constructor,
)
column_descriptions = {
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": "impute.fill_abnormal",
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": ["encode.cat_boost", "encode.target", "encode.label"],
        "num->target": {"_name": "lightgbm", "_vanilla": True}
    }
)
tuner = Tuner(
    run_limit=-1,
    search_method="grid"
)
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
autoflow_pipeline.fit(
    X_train=df_train, X_test=df_test, column_descriptions=column_descriptions
)
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=10,
)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": [],
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow.estimator.base import AutoFlowEstimator
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(initial_runs=1, run_limit=12, n_jobs=1)
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "reduce.pca|lightgbm"
    })
autoflow_pipeline = AutoFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
autoflow_pipeline.fit(X_train=df_train, X_test=df_test, column_descriptions=column_descriptions)
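# NOTE: a minimal sketch of evaluating the fitted pipeline on the held-out split.
# It assumes AutoFlowEstimator follows the sklearn predict() convention; the metric
# and column handling below are illustrative, not part of the example above.
from sklearn.metrics import accuracy_score

y_true = df_test["Survived"]
y_pred = autoflow_pipeline.predict(df_test.drop(columns=["Survived"]))
print("hold-out accuracy:", accuracy_score(y_true, y_pred))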