def __init__(
        self,
        tuner: Union[Tuner, List[Tuner], None, dict] = None,
        hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
        resource_manager: Union[ResourceManager, str, None] = None,
        ensemble_builder: Union[StackEnsembleBuilder, None, bool, int] = None,
        random_state=42
):
    # ---logger------------------------------------
    self.logger = get_logger(__name__)
    # ---random_state-----------------------------------
    self.random_state = random_state
    # ---ensemble_builder-----------------------------------
    if ensemble_builder is None:
        self.logger.info("Using default StackEnsembleBuilder.")
        ensemble_builder = StackEnsembleBuilder()
    elif ensemble_builder == False:
        self.logger.info("Not using EnsembleBuilder, will select the best estimator.")
    else:
        ensemble_builder = StackEnsembleBuilder(set_model=ensemble_builder)
    self.ensemble_builder = ensemble_builder
    # ---tuners-----------------------------------
    if not tuner:
        tuner = Tuner()
    if not isinstance(tuner, (list, tuple)):
        tuner = [tuner]
    self.tuners: List[Tuner] = tuner
    # ---hdl_constructors-----------------------------------
    if not hdl_constructor:
        hdl_constructor = HDL_Constructor()
    if not isinstance(hdl_constructor, (list, tuple)):
        hdl_constructor = [hdl_constructor]
    self.hdl_constructors = hdl_constructor
    # ---resource_manager-----------------------------------
    if resource_manager is None:
        resource_manager = ResourceManager()
    self.resource_manager = resource_manager
    # ---member_variable------------------------------------
    self.estimator = None
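A minimal sketch of how this normalization behaves, assuming the class is HyperFlowEstimator (the one used in the examples below): bare objects are wrapped into one-element lists, and ensemble_builder=False switches off stacking in favor of the single best estimator.

est = HyperFlowEstimator()   # all defaults: Tuner(), HDL_Constructor(), StackEnsembleBuilder()
assert isinstance(est.tuners, list) and len(est.tuners) == 1   # a bare Tuner is wrapped

est2 = HyperFlowEstimator(tuner=[Tuner(), Tuner()], ensemble_builder=False)
assert len(est2.tuners) == 2            # lists pass through unchanged
assert est2.ensemble_builder is False   # the best single estimator will be selected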
# Example: chaining candidate algorithms with "|" (cat_boost|standardize, pca|lightgbm).
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost|scale.standardize",
        "num->target": "reduce.pca|lightgbm"
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
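Once fit returns, scoring the held-out frame should be a one-liner. This is a hedged sketch that assumes HyperFlowEstimator exposes a sklearn-style predict() (not shown in the snippets here):

# Assumption: predict() follows the sklearn convention and uses
# column_descriptions to locate the feature columns in df_test.
y_pred = hyperflow_pipeline.predict(df_test)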
# Example: split NaN columns by missing ratio, then delete/drop accordingly.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=10,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": "impute.delete",
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": "lightgbm"
    }
)
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: search over eight categorical encoders against a vanilla lightgbm.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": [
            "encode.binary",
            "encode.cat_boost",
            "encode.hash",
            "encode.label",
            "encode.leave_one_out",
            "encode.one_hot",
            "encode.target",
            "encode.weight_of_evidence"
        ],
        "num->target": {
            "_name": "lightgbm",
            "_vanilla": True
        }
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: exhaustive grid search (run_limit=-1) over a fixed DAG.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": "impute.fill_abnormal",
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": {
            "_name": "lightgbm",
            "_vanilla": True
        }
    })
tuner = Tuner(run_limit=-1, search_method="grid")
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Regression example: compare feature-selection strategies on the Boston dataset.
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

boston = load_boston()
data = boston.data
target = boston.target
columns = list(boston.feature_names) + ["target"]
df = pd.DataFrame(np.hstack([data, target[:, None]]), columns=columns)
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hyperflow_pipeline = HyperFlowEstimator(
    tuner,
    HDL_Constructor(
        DAG_descriptions={
            "num->num": [
                "select.from_model_reg",
                "select.univar_reg",
                "select.rfe_reg"
                # , None  # optionally: no feature selection at all
            ],
            "num->target": ["lightgbm"]
        }))
column_descriptions = {"target": "target"}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: weighting candidates with "__proba" priors in the DAG description.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": [
            "encode.cat_boost",
            {
                "_name": "encode.label",
                "__proba": 0.8
            },
        ],
        "num->num": [
            "scale.normalize",
            {
                "_name": "None",
                "__proba": 0.8
            }
        ],
        "num->target": {
            "_name": "catboost",
            "_vanilla": False
        }
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: compare under-sampling strategies for class balancing.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=30,
    run_limit=0,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "under_sample": [
            "balance.under_sample.all_knn",
            "balance.under_sample.cluster_centroids",
            "balance.under_sample.condensed_nearest_neighbour",
            "balance.under_sample.edited_nearest_neighbours",
            "balance.under_sample.instance_hardness_threshold",
            "balance.under_sample.near_miss",
            "balance.under_sample.neighbourhood_cleaning_rule",
            "balance.under_sample.one_sided_selection",
            "balance.under_sample.random",
            "balance.under_sample.repeated_edited_nearest_neighbours",
            "balance.under_sample.tomek_links",
        ],
        "num->target": {"_name": "lightgbm", "_vanilla": True}
    }
)
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: two-stage search with a list of HDL_Constructor objects; the second
# stage's "<placeholder>" entries are filled in from the first stage's results.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.hdl.hdl_constructor import HDL_Constructor

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

hdl_constructors = [
    HDL_Constructor(
        DAG_descriptions={
            "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
            "lowR_nan->nan": "impute.fill_abnormal",
            "highR_nan->nan": "operate.drop",
            "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
            "cat->num": "encode.label",
            "num->num": [
                {"_name": "select.from_model_clf", "_select_percent": 80},
                {"_name": "select.rfe_clf", "_select_percent": 80},
                # {"_name": "select.univar_clf", "_select_percent": 80},
            ],
            "num->target": {"_name": "lightgbm", "_vanilla": True}
        }
    ),
    HDL_Constructor(
        DAG_descriptions={
            "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
            "lowR_nan->nan": "impute.fill_abnormal",
            "highR_nan->nan": "operate.drop",
            "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
            "cat->num": "encode.label",
            "num->num": {"_name": "<placeholder>",
                         "_select_percent": "<placeholder>"},
            "num->target": {"_name": "lightgbm", "_vanilla": True}
        }
    ),
]
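The snippet above truncates before the pipeline is launched. Since the __init__ at the top of this section accepts a list of HDL_Constructor objects, a hedged sketch of the missing tail (tuner settings assumed; fit arguments mirrored from the sibling examples) would be:

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.tuner.tuner import Tuner

# Assumption: a single tuner is reused across both HDL stages.
tuner = Tuner(run_limit=12)
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructors)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(X=df_train, X_test=df_test,
                       column_descriptions=column_descriptions, n_jobs=1)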
# Multiclass example: feature selection plus lightgbm on the digits dataset.
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

digits = load_digits()
data = digits.data
target = digits.target
columns = [str(i) for i in range(data.shape[1])] + ["target"]
df = pd.DataFrame(np.hstack([data, target[:, None]]), columns=columns)
df["target"] = df["target"].astype("int")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hyperflow_pipeline = HyperFlowEstimator(
    tuner,
    HDL_Constructor(DAG_descriptions={
        "num->num": ["select.from_model_clf"],
        "num->target": ["lightgbm"]
    }))
column_descriptions = {"target": "target"}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: outlier removal with isolation forest before training.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=30,
    run_limit=0,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "remove_abnormal": ["abnormal.isolation_forest"],
        "num->target": {
            "_name": "lightgbm",
            "_vanilla": False
        }
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: compare over-sampling strategies for class balancing.
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=8,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.over_sample.adasyn",
            "balance.over_sample.borderline_smote",
            "balance.over_sample.kmeans_smote",
            "balance.over_sample.random",
            "balance.over_sample.smote",
            # "balance.over_sample.smotenc",
            "balance.over_sample.svmsmote",
        ],
        "num->target": {
            "_name": "lightgbm",
            "_vanilla": True
        }
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test,
    column_descriptions=column_descriptions,
    n_jobs=1
)
# Example: variance and Pearson-correlation compression before logistic regression.
import pandas as pd

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../data/QSAR.csv")

hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "num->var": "compress.variance",
        "var->pea": {
            "_name": "compress.pearson",
            "n_jobs": 6
        },
        "pea->target": "logistic_regression"
    })
tuner = Tuner(run_limit=12, initial_runs=12, search_method="smac")
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {"id": "Name", "target": "labels"}
hyperflow_pipeline.fit(X=df, column_descriptions=column_descriptions, n_jobs=3)
# Example: custom preprocessing, a dedicated ResourceManager directory, and
# persisting the fitted pipeline with joblib.
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner
# NOTE: the ResourceManager import path below is an assumption; adjust it to
# wherever ResourceManager lives in your installation.
from hyperflow.manager.resource_manager import ResourceManager


def data_preprocessing():
    # `df` (with a `Label` column) and the raw feature matrix `newfeature`
    # are built in code omitted from the original snippet.
    # The original snippet begins mid-loop; a column-wise NaN scan is assumed:
    for i in range(newfeature.shape[1]):
        ind = np.isnan(newfeature[:, i])
        if ind.any():
            # Mean-impute missing entries in this column.
            temp = newfeature[:, i]
            a = temp[~np.isnan(temp)].mean()
            newfeature[:, i][np.isnan(temp)] = a
    # Standardize the features.
    stdScale = StandardScaler().fit(newfeature)
    newfeaturenorm = stdScale.transform(newfeature)
    # Bin the labels into intervals.
    bins = [-9, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 24]
    new_range = pd.cut(df.Label, bins)
    newlabel = np.array(df.Label)
    return newfeaturenorm, newlabel, new_range


x_train, y_train, y_range = data_preprocessing()

tuner = Tuner(
    initial_runs=12,
    run_limit=120,
)
hdl_constructor = HDL_Constructor(DAG_descriptions={"num->target": "lightgbm"})
resource_manager = ResourceManager(os.getcwd() + "/for_hxw_result")
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor,
                                        resource_manager=resource_manager,
                                        ensemble_builder=False)
hyperflow_pipeline.fit(X=x_train, y=y_train, n_jobs=3)
joblib.dump(hyperflow_pipeline, "hyperflow_pipeline_for_hxw.bz")
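Because the fitted pipeline is persisted with joblib, it can be reloaded later for inference. A short sketch, assuming HyperFlowEstimator exposes a sklearn-style predict():

# Reload the persisted pipeline and score data with it.
loaded_pipeline = joblib.load("hyperflow_pipeline_for_hxw.bz")
y_pred = loaded_pipeline.predict(x_train)  # predict() assumed sklearn-style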