def test_regression_xgboost(self):
    """Fit two custom XGBoost learners on sparse random regression data.

    An explicit validation split is passed to ``fit`` and the test checks
    that the validation matrix kept in the search state has the shape of the
    one supplied.  The remaining statements only print result attributes.
    """
    n_features = 900
    X_train = scipy.sparse.random(300, n_features, density=0.0001)
    y_train = np.random.uniform(size=300)
    X_val = scipy.sparse.random(100, n_features, density=0.0001)
    y_val = np.random.uniform(size=100)

    automl = AutoML()
    for name, learner in (("my_xgb1", MyXGB1), ("my_xgb2", MyXGB2)):
        automl.add_learner(learner_name=name, learner_class=learner)

    fit_kwargs = dict(
        time_budget=2,
        estimator_list=["my_xgb1", "my_xgb2"],
        task="regression",
        log_file_name="test/regression_xgboost.log",
        n_jobs=1,
        model_history=True,
        keep_search_state=True,  # needed so that ``_state.X_val`` can be inspected below
        early_stop=True,
    )
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        **fit_kwargs,
    )
    assert automl._state.X_val.shape == X_val.shape

    print(automl.predict(X_train))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("my_xgb2"))
    # Dump the remaining summary attributes in the original order.
    for attr in (
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(automl, attr))
def test_time_limit(self):
    """Run three fits with a 0.5 s time budget, switching the estimator each time.

    The first fit uses ``catboost``; the following two reuse the same
    settings with the custom ``large_xgb`` and ``large_lgbm`` learners.
    """
    automl = AutoML()
    automl.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    automl.add_learner(learner_name="large_xgb", learner_class=MyLargeXGB)

    settings = {
        "time_budget": 0.5,
        "task": "classification",
        "log_file_name": "test/classification_timeout.log",
        "estimator_list": ["catboost"],
        "log_type": "all",
        "hpo_method": "random",
    }
    X_train, y_train = load_iris(return_X_y=True, as_frame=True)

    automl.fit(X_train=X_train, y_train=y_train, **settings)
    print(automl.model.params)

    # Same settings, different (custom) estimator for each subsequent run.
    for estimator in ("large_xgb", "large_lgbm"):
        settings["estimator_list"] = [estimator]
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        print(automl.model)
def test_random_skip_oom(self):
    """Fit the custom ``large_lgbm`` learner on a 900000 x 900000 sparse identity matrix.

    The run requests two concurrent trials with random search.  An
    ``ImportError`` raised anywhere in the fit/report sequence is caught and
    reported as a skip message (the message attributes it to ray being
    absent).
    """
    n_rows = 900000
    automl = AutoML()
    automl.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    settings = dict(
        time_budget=2,
        task="classification",
        log_file_name="test/sparse_classification_oom.log",
        estimator_list=["large_lgbm"],
        log_type="all",
        n_jobs=1,
        hpo_method="random",
        n_concurrent_trials=2,
    )
    X_train = scipy.sparse.eye(n_rows)
    y_train = np.random.randint(2, size=n_rows)

    try:
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        print(automl.predict(X_train))
        print(automl.model)
        print(automl.config_history)
        print(automl.best_model_for_estimator("large_lgbm"))
        print(automl.best_iteration)
        print(automl.best_estimator)
    except ImportError:
        print("skipping concurrency test as ray is not installed")
def test_custom_learner(self):
    """Fit AutoML on the wine data with the custom ``RGF`` learner in the estimator list."""
    # NOTE(review): another ``test_custom_learner`` is defined later in this
    # chunk; if both live in the same class, this definition is shadowed and
    # never collected -- confirm and rename/remove one of them.
    X_train, y_train = load_wine(return_X_y=True)

    automl = AutoML()
    automl.add_learner(
        learner_name="RGF",
        learner_class=MyRegularizedGreedyForest,
    )

    settings = dict(
        time_budget=10,  # total running time in seconds
        estimator_list=["RGF", "lgbm", "rf", "xgboost"],
        task="classification",  # task type
        sample=True,  # whether to subsample training data
        log_file_name="test/wine.log",
        log_training_metric=True,  # whether to log training metric
        n_jobs=1,
    )

    # The main flaml automl API
    automl.fit(X_train=X_train, y_train=y_train, **settings)
def _test_memory_limit(self):
    """Run a single iteration of the custom ``large_lgbm`` learner on iris.

    ``time_budget`` is -1 and the run is bounded by ``max_iter=1`` instead.
    The leading underscore in the name keeps default test discovery from
    collecting this method.
    """
    X_train, y_train = load_iris(return_X_y=True, as_frame=True)

    automl = AutoML()
    automl.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)

    settings = dict(
        time_budget=-1,
        task="classification",
        log_file_name="test/classification_oom.log",
        estimator_list=["large_lgbm"],
        log_type="all",
        hpo_method="random",
    )
    automl.fit(X_train=X_train, y_train=y_train, max_iter=1, **settings)
    print(automl.model)
def test_ensemble(self):
    """Fit AutoML on wine with ensembling, using the custom RGF as the final estimator.

    ``RGF`` is registered as a learner but is not part of ``estimator_list``;
    an instance of its class is passed as the ensemble's ``final_estimator``.
    """
    X_train, y_train = load_wine(return_X_y=True)

    automl = AutoML()
    automl.add_learner(
        learner_name="RGF",
        learner_class=MyRegularizedGreedyForest,
    )

    ensemble_spec = {
        "final_estimator": MyRegularizedGreedyForest(),
        "passthrough": False,
    }
    settings = dict(
        time_budget=5,  # total running time in seconds
        estimator_list=["rf", "xgboost", "catboost"],
        task="classification",  # task type
        sample=True,  # whether to subsample training data
        log_file_name="test/wine.log",
        log_training_metric=True,  # whether to log training metric
        ensemble=ensemble_spec,
        n_jobs=1,
    )

    # The main flaml automl API
    automl.fit(X_train=X_train, y_train=y_train, **settings)
def test_custom_learner(self):
    """Fit AutoML with the custom ``RGF`` learner, then refit with an empty RGF search space.

    The first fit runs with the learner's own search space and prints the
    best RGF model found.  The second fit runs while
    ``MyRegularizedGreedyForest.search_space`` is replaced by a function
    returning ``{}``.

    The replacement is applied with ``mock.patch.object`` so the original
    class attribute is restored when the block exits (also on failure).
    Assigning the lambda directly on the class would leave it in place for
    every later test that uses ``MyRegularizedGreedyForest``, making results
    depend on test execution order.
    """
    # Local import keeps this block self-contained; unittest.mock is stdlib.
    from unittest import mock

    automl = AutoML()
    automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
    X_train, y_train = load_wine(return_X_y=True)
    settings = {
        "time_budget": 8,  # total running time in seconds
        "estimator_list": ["RGF", "lgbm", "rf", "xgboost"],
        "task": "classification",  # task type
        "sample": True,  # whether to subsample training data
        "log_file_name": "test/wine.log",
        "log_training_metric": True,  # whether to log training metric
        "n_jobs": 1,
    }

    # The main flaml automl API
    automl.fit(X_train=X_train, y_train=y_train, **settings)
    # print the best model found for RGF
    print(automl.best_model_for_estimator("RGF"))

    # Refit with an empty search space for RGF; the override is scoped to
    # this block and undone afterwards.
    with mock.patch.object(
        MyRegularizedGreedyForest,
        "search_space",
        lambda data_size, task: {},
    ):
        automl.fit(X_train=X_train, y_train=y_train, **settings)
data_dir="./") X_train = X_train.iloc[:1000] y_train = y_train.iloc[:1000] class ExtraTreesEstimatorSeeded(ExtraTreesEstimator): """ExtraTreesEstimator for reproducible FLAML run.""" def config2params(self, config: dict) -> dict: params = super().config2params(config) params["random_state"] = 0 return params settings = { "time_budget": 1e10, # total running time in seconds "max_iter": 3, "metric": "ap", # average_precision "task": "classification", # task type "seed": 7654321, # random seed "estimator_list": ["extra_trees_seeded"], "verbose": False, } for trial_num in range(8): automl = AutoML() automl.add_learner(learner_name="extra_trees_seeded", learner_class=ExtraTreesEstimatorSeeded) automl.fit(X_train=X_train, y_train=y_train, **settings) print(automl.best_loss) print(automl.best_config)
def test_fit_w_freezinghp_starting_point(self, as_frame=True):
    """Resume a search from a previous best LGBM config with some hyperparameters frozen.

    A first short run tunes ``lgbm`` on iris.  Its best config is then used
    as the starting point of a second run on a custom LGBM subclass whose
    search space pins selected hyperparameters to the starting-point values,
    changes the ranges of two others and adds ``subsample``.

    Args:
        as_frame: Passed to ``load_iris``.  When true, the columns are
            renamed to integers and an all-zero column is appended before
            fitting.
    """
    first_settings = {
        "time_budget": 1,
        "metric": "accuracy",
        "task": "classification",
        "estimator_list": ["lgbm"],
        "log_file_name": "test/iris.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
    if as_frame:
        # test drop column
        X_train.columns = range(X_train.shape[1])
        X_train[X_train.shape[1]] = np.zeros(len(y_train))

    first_run = AutoML()
    first_run.fit(X_train=X_train, y_train=y_train, **first_settings)
    first_accuracy = 1.0 - first_run.best_loss
    print("Best ML leaner:", first_run.best_estimator)
    print("Best hyperparmeter config:", first_run.best_config)
    print("Best accuracy on validation data: {0:.4g}".format(first_accuracy))
    print(
        "Training duration of best run: {0:.4g} s".format(
            first_run.best_config_train_time
        )
    )

    # 1. Get starting points from previous experiments.
    starting_points = first_run.best_config_per_estimator
    print("starting_points", starting_points)
    print("loss of the starting_points", first_run.best_loss_per_estimator)
    starting_point = starting_points["lgbm"]
    hps_to_freeze = ["colsample_bytree", "reg_alpha", "reg_lambda", "log_max_bin"]

    # 2. Construct a new class:
    #    a. write the hps you want to freeze as hps with constant 'domain';
    #    b. specify the new search space of the other hps accordingly.
    class MyPartiallyFreezedLargeLGBM(LGBMEstimator):
        @classmethod
        def search_space(cls, **params):
            # (1) Get the hps in the original search space.
            space = LGBMEstimator.search_space(**params)

            # (2) Pin every hp to freeze that is present in the starting
            #     point to that value; hps absent from the starting point
            #     keep the setting from the original search space.
            space.update(
                {
                    hp_name: {"domain": starting_point[hp_name]}
                    for hp_name in hps_to_freeze
                    if hp_name in starting_point
                }
            )

            def _log_int_hp(hp_name, upper):
                # Log-uniform integer range [10, upper) that starts from the
                # starting-point value when available, otherwise from the
                # original init value (default 10).
                original = space[hp_name]
                return {
                    "domain": tune.lograndint(lower=10, upper=upper),
                    "init_value": starting_point.get(hp_name)
                    or original.get("init_value", 10),
                    "low_cost_init_value": original.get("low_cost_init_value", 10),
                }

            revised_hps_to_search = {
                # (3.1) hps that are in the original search space but whose
                #       range is changed.
                "n_estimators": _log_int_hp("n_estimators", 32768),
                "num_leaves": _log_int_hp("num_leaves", 3276),
                # (3.2) Add a new hp which is not in the original search space.
                "subsample": {
                    "domain": tune.uniform(lower=0.1, upper=1.0),
                    "init_value": 0.1,
                },
            }
            space.update(revised_hps_to_search)
            return space

    new_estimator_name = "large_lgbm"
    resumed_run = AutoML()
    resumed_run.add_learner(
        learner_name=new_estimator_name,
        learner_class=MyPartiallyFreezedLargeLGBM,
    )
    resume_settings = {
        "time_budget": 3,
        "metric": "accuracy",
        "task": "classification",
        "estimator_list": [new_estimator_name],
        "log_file_name": "test/iris_resume.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
        "log_type": "all",
        "starting_points": {new_estimator_name: starting_point},
    }
    resumed_run.fit(X_train=X_train, y_train=y_train, **resume_settings)

    resumed_accuracy = 1.0 - resumed_run.best_loss
    print("Best ML leaner:", resumed_run.best_estimator)
    print("Best hyperparmeter config:", resumed_run.best_config)
    print("Best accuracy on validation data: {0:.4g}".format(resumed_accuracy))
    print(
        "Training duration of best run: {0:.4g} s".format(
            resumed_run.best_config_train_time
        )
    )