def test_sparse_matrix_lr(self):
    """Fit the ``lrl1``/``lrl2`` estimators on a random sparse matrix, twice.

    The first fit passes ``train_time_limit=1`` with a time budget of 3; the
    second fit reuses the same AutoML instance with the budget raised to 5.
    The fitted state is then printed as a smoke check.
    """
    automl = AutoML()
    settings = dict(
        time_budget=3,
        metric="f1",
        task="classification",
        log_file_name="test/sparse_classification.log",
        estimator_list=["lrl1", "lrl2"],
        log_type="all",
        n_jobs=1,
    )
    features = scipy.sparse.random(3000, 3000, density=0.1)
    labels = np.random.randint(2, size=3000)

    # First pass: short budget plus an explicit train_time_limit.
    automl.fit(X_train=features, y_train=labels, train_time_limit=1, **settings)

    # Second pass on the same instance with a larger budget.
    settings["time_budget"] = 5
    automl.fit(X_train=features, y_train=labels, **settings)

    print(automl.predict(features))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("lrl2"))
    print(automl.best_iteration)
    print(automl.best_estimator)
def test_sparse_matrix_regression(self):
    """Run a regression search on very sparse random data with a validation set.

    Asserts that the validation matrix kept in the search state has the same
    shape as the one passed in, then prints the fitted state.
    """
    train_x = scipy.sparse.random(300, 900, density=0.0001)
    train_y = np.random.uniform(size=300)
    val_x = scipy.sparse.random(100, 900, density=0.0001)
    val_y = np.random.uniform(size=100)

    automl = AutoML()
    settings = dict(
        time_budget=2,
        metric="mae",
        task="regression",
        log_file_name="test/sparse_regression.log",
        n_jobs=1,
        model_history=True,
        keep_search_state=True,
        verbose=0,
        early_stop=True,
    )
    automl.fit(
        X_train=train_x,
        y_train=train_y,
        X_val=val_x,
        y_val=val_y,
        **settings,
    )

    # keep_search_state=True is what makes _state inspectable after fit.
    assert automl._state.X_val.shape == val_x.shape

    print(automl.predict(train_x))
    for attr in ("model", "config_history"):
        print(getattr(automl, attr))
    print(automl.best_model_for_estimator("rf"))
    for attr in (
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(automl, attr))
def test_regression_xgboost(self):
    """Run a regression search restricted to two custom XGBoost learners.

    ``MyXGB1`` and ``MyXGB2`` are registered under ``my_xgb1``/``my_xgb2`` and
    used as the only estimators on sparse random data with a validation set.
    """
    train_x = scipy.sparse.random(300, 900, density=0.0001)
    train_y = np.random.uniform(size=300)
    val_x = scipy.sparse.random(100, 900, density=0.0001)
    val_y = np.random.uniform(size=100)

    automl = AutoML()
    automl.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
    automl.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
    settings = dict(
        time_budget=2,
        estimator_list=["my_xgb1", "my_xgb2"],
        task="regression",
        log_file_name="test/regression_xgboost.log",
        n_jobs=1,
        model_history=True,
        keep_search_state=True,
        early_stop=True,
    )
    automl.fit(
        X_train=train_x,
        y_train=train_y,
        X_val=val_x,
        y_val=val_y,
        **settings,
    )

    # The validation data stored in the search state must match the input.
    assert automl._state.X_val.shape == val_x.shape

    print(automl.predict(train_x))
    for attr in ("model", "config_history"):
        print(getattr(automl, attr))
    print(automl.best_model_for_estimator("my_xgb2"))
    for attr in (
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(automl, attr))
def test_random_skip_oom(self):
    """Run random search with two concurrent trials on a custom large LGBM.

    The whole fit-and-report sequence sits inside the ``try`` so that an
    ``ImportError`` (reported as ray being missing) skips the test body.
    """
    automl = AutoML()
    automl.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    settings = dict(
        time_budget=2,
        task="classification",
        log_file_name="test/sparse_classification_oom.log",
        estimator_list=["large_lgbm"],
        log_type="all",
        n_jobs=1,
        hpo_method="random",
        n_concurrent_trials=2,
    )
    features = scipy.sparse.eye(900000)
    labels = np.random.randint(2, size=900000)

    try:
        automl.fit(X_train=features, y_train=labels, **settings)
        print(automl.predict(features))
        print(automl.model)
        print(automl.config_history)
        print(automl.best_model_for_estimator("large_lgbm"))
        print(automl.best_iteration)
        print(automl.best_estimator)
    except ImportError:
        print("skipping concurrency test as ray is not installed")
        return
def test_parallel_xgboost(self, hpo_method=None):
    """Run xgboost with two concurrent trials on data placed in the ray store.

    Args:
        hpo_method: Forwarded unchanged as the ``hpo_method`` fit setting.

    The training matrix is passed to ``fit`` as a ``ray.put`` reference, while
    prediction uses the original in-memory matrix. If an ``ImportError`` is
    raised (e.g. ray is not installed) the test body is skipped with a message
    instead of returning silently.
    """
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 10,
        "metric": "ap",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 2,
        "hpo_method": hpo_method,
    }
    X_train = scipy.sparse.eye(900000)
    y_train = np.random.randint(2, size=900000)
    try:
        import ray

        X_train_ref = ray.put(X_train)
        automl_experiment.fit(X_train=X_train_ref, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
    except ImportError:
        # Make the skip visible in the test output rather than passing silently.
        print("skipping concurrency test as ray is not installed")
        return
def test_custom_metric(self):
    """Fit on iris with a user-supplied metric, then read the log back.

    Steps: cross-validated fit with ``custom_metric`` and an ensemble, rebuild
    an estimator from the first log record, load the logged histories, and
    finally repeat the fit through ray when ray can be imported.
    """
    frame, target = load_iris(return_X_y=True, as_frame=True)
    frame["label"] = target

    automl = AutoML()
    settings = dict(
        dataframe=frame,
        label="label",
        time_budget=5,
        eval_method="cv",
        metric=custom_metric,
        task="classification",
        log_file_name="test/iris_custom.log",
        log_training_metric=True,
        log_type="all",
        n_jobs=1,
        model_history=True,
        sample_weight=np.ones(len(target)),
        pred_time_limit=1e-5,
        ensemble=True,
    )
    automl.fit(**settings)
    print(automl.classes_)
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("rf"))
    print(automl.best_iteration)
    print(automl.best_estimator)

    # A fresh instance is used to reconstruct an estimator from the log file.
    automl = AutoML()
    log_path = settings["log_file_name"]
    restored = automl.get_estimator_from_log(log_path, record_id=0, task="multi")
    print(restored)

    # Only the last of the five returned histories is printed.
    _, _, _, _, metric_history = get_output_from_log(filename=log_path, time_budget=6)
    print(metric_history)

    try:
        import ray

        frame = ray.put(frame)
        settings["dataframe"] = frame
        settings["use_ray"] = True
        automl.fit(**settings)
    except ImportError:
        pass
def test_custom_learner(self):
    """Register a custom ``RGF`` learner and search over it with built-ins."""
    # NOTE(review): another ``test_custom_learner`` is defined later in this
    # file; if both live in the same class this one is shadowed -- confirm.
    automl = AutoML()
    automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
    features, labels = load_wine(return_X_y=True)
    fit_kwargs = dict(
        time_budget=10,  # total running time in seconds
        estimator_list=["RGF", "lgbm", "rf", "xgboost"],
        task="classification",  # task type
        sample=True,  # whether to subsample training data
        log_file_name="test/wine.log",
        log_training_metric=True,  # whether to log training metric
        n_jobs=1,
    )

    # The main flaml automl API
    automl.fit(X_train=features, y_train=labels, **fit_kwargs)

    # print the best model found for RGF
    print(automl.best_model_for_estimator("RGF"))
def test_regression(self):
    """Fit a regressor on California housing with an explicit 90/10 split.

    Checks that supplying ``X_val``/``y_val`` results in the ``holdout``
    evaluation method, prints the fitted state and the log output, then
    retrains from the log twice (time budgets 1 and 0).
    """
    automl = AutoML()
    settings = dict(
        time_budget=2,
        task="regression",
        log_file_name="test/california.log",
        log_training_metric=True,
        n_jobs=1,
        model_history=True,
    )
    features, target = fetch_california_housing(return_X_y=True)

    # First nine tenths train, the remainder validates.
    cut = len(target) * 9 // 10
    automl.fit(
        X_train=features[:cut],
        y_train=target[:cut],
        X_val=features[cut:],
        y_val=target[cut:],
        **settings,
    )
    assert automl._state.eval_method == "holdout"

    print(automl.predict(features))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("xgboost"))
    print(automl.best_iteration)
    print(automl.best_estimator)
    print(get_output_from_log(settings["log_file_name"], 1))

    # Retrain on the full data from the log, first with budget 1, then 0.
    for budget in (1, 0):
        automl.retrain_from_log(
            task="regression",
            log_file_name=settings["log_file_name"],
            X_train=features,
            y_train=target,
            train_full=True,
            time_budget=budget,
        )
def test_sparse_matrix_classification(self):
    """Run a 3-class search on a sparse integer matrix with a uniform split."""
    automl = AutoML()
    settings = dict(
        time_budget=2,
        metric="auto",
        task="classification",
        log_file_name="test/sparse_classification.log",
        split_type="uniform",
        n_jobs=1,
        model_history=True,
    )
    features = scipy.sparse.random(1554, 21, dtype=int)
    labels = np.random.randint(3, size=1554)
    automl.fit(X_train=features, y_train=labels, **settings)

    print(automl.classes_)
    print(automl.predict_proba(features))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("extra_tree"))
    print(automl.best_iteration)
    print(automl.best_estimator)
def test_custom_learner(self):
    """Search with a custom ``RGF`` learner, then refit with an empty space.

    The second fit replaces ``MyRegularizedGreedyForest.search_space`` with a
    function returning ``{}``. That replacement is a class-level patch, so it
    is undone in a ``finally`` block to keep it from leaking into other tests
    that use the same learner class.
    """
    automl = AutoML()
    automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
    X_train, y_train = load_wine(return_X_y=True)
    settings = {
        "time_budget": 8,  # total running time in seconds
        "estimator_list": ["RGF", "lgbm", "rf", "xgboost"],
        "task": "classification",  # task type
        "sample": True,  # whether to subsample training data
        "log_file_name": "test/wine.log",
        "log_training_metric": True,  # whether to log training metric
        "n_jobs": 1,
    }

    # The main flaml automl API
    automl.fit(X_train=X_train, y_train=y_train, **settings)

    # print the best model found for RGF
    print(automl.best_model_for_estimator("RGF"))

    # Read the raw attribute from the class __dict__ (not getattr) so that a
    # classmethod/staticmethod descriptor is restored unchanged afterwards.
    missing = object()
    previous = vars(MyRegularizedGreedyForest).get("search_space", missing)
    MyRegularizedGreedyForest.search_space = lambda data_size, task: {}
    try:
        automl.fit(X_train=X_train, y_train=y_train, **settings)
    finally:
        if previous is missing:
            # The attribute was inherited; drop the override to re-expose it.
            del MyRegularizedGreedyForest.search_space
        else:
            MyRegularizedGreedyForest.search_space = previous
def test_sparse_matrix_xgboost(self):
    """Fit xgboost alone on a 900000x900000 sparse identity matrix."""
    automl = AutoML()
    settings = dict(
        time_budget=3,
        metric="ap",
        task="classification",
        log_file_name="test/sparse_classification.log",
        estimator_list=["xgboost"],
        log_type="all",
        n_jobs=1,
    )
    features = scipy.sparse.eye(900000)
    labels = np.random.randint(2, size=900000)
    automl.fit(X_train=features, y_train=labels, **settings)

    print(automl.predict(features))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("xgboost"))
    print(automl.best_iteration)
    print(automl.best_estimator)
def test_classification(self, as_frame=False):
    """Fit a classifier on iris, then retrain from the first log record.

    Args:
        as_frame: Load iris as a DataFrame. When true, the columns are renamed
            to integers and an all-zero column is appended before fitting.
    """
    automl = AutoML()
    settings = dict(
        time_budget=4,
        metric="accuracy",
        task="classification",
        log_file_name="test/iris.log",
        log_training_metric=True,
        n_jobs=1,
        model_history=True,
    )
    X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
    if as_frame:
        # test drop column
        n_cols = X_train.shape[1]
        X_train.columns = range(n_cols)
        X_train[X_train.shape[1]] = np.zeros(len(y_train))
    automl.fit(X_train=X_train, y_train=y_train, **settings)

    print(automl.classes_)
    print(automl.predict(X_train)[:5])
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("catboost"))
    print(automl.best_iteration)
    print(automl.best_estimator)

    # Drop the fit-only entries; only the log file name is read below.
    for key in ("metric", "model_history", "log_training_metric"):
        del settings[key]

    # The task is given to the constructor here instead of to fit().
    automl = AutoML(task="classification")
    elapsed = automl.retrain_from_log(
        log_file_name=settings["log_file_name"],
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        record_id=0,
    )
    print(elapsed)
    print(automl.model)
    print(automl.predict_proba(X_train)[:5])
def test_sparse_matrix_regression_holdout(self):
    """Run a holdout regression search on a tiny (8-row) sparse matrix."""
    features = scipy.sparse.random(8, 100)
    target = np.random.uniform(size=8)

    automl = AutoML()
    settings = dict(
        time_budget=1,
        eval_method="holdout",
        task="regression",
        log_file_name="test/sparse_regression.log",
        n_jobs=1,
        model_history=True,
        metric="mse",
        sample_weight=np.ones(len(target)),
        early_stop=True,
    )
    automl.fit(X_train=features, y_train=target, **settings)

    print(automl.predict(features))
    print(automl.model)
    print(automl.config_history)
    print(automl.best_model_for_estimator("rf"))
    print(automl.best_iteration)
    print(automl.best_estimator)
def test_parallel(self, hpo_method=None):
    """Run a regression search on California housing with 10 concurrent trials.

    Args:
        hpo_method: Forwarded unchanged as the ``hpo_method`` fit setting.

    An ``ImportError`` raised during the fit-and-report sequence skips the
    rest of the test; a message is printed so the skip is not silent.
    """
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 10,
        "task": "regression",
        "log_file_name": "test/california.log",
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 10,
        "hpo_method": hpo_method,
    }
    X_train, y_train = fetch_california_housing(return_X_y=True)
    try:
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
    except ImportError:
        # Presumably raised when ray, needed for concurrent trials, is
        # missing; same message as the other concurrency test in this file.
        print("skipping concurrency test as ray is not installed")
        return
def test_training_log(self, path="test_training_log.log", estimator_list="auto", use_ray=False):
    """Check that fitting writes a readable training log and can be replayed.

    Args:
        path: Log file name, created inside a temporary directory.
        estimator_list: Forwarded as the ``estimator_list`` fit setting.
        use_ray: Forwarded to the replay fit that starts from the best config.

    Flow: fit with logging enabled, verify the log file exists, retrain the
    best config directly and through a fresh ``fit`` seeded with it (the two
    models must match), read the log records back, and finally exercise fits
    with an empty log file name and with ``max_iter=0``.
    """
    with TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, path)

        # Run a simple job.
        automl = AutoML()
        automl_settings = dict(
            time_budget=1,
            metric="mse",
            task="regression",
            log_file_name=filename,
            log_training_metric=True,
            mem_thres=1024 * 1024,
            n_jobs=1,
            model_history=True,
            train_time_limit=0.1,
            verbose=3,
            # ensemble=True,
            keep_search_state=True,
            estimator_list=estimator_list,
        )
        X_train, y_train = fetch_california_housing(return_X_y=True)
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

        # Check if the training log file is populated.
        self.assertTrue(os.path.exists(filename))

        if automl.best_estimator:
            estimator = automl.best_estimator
            config = automl.best_config
            best_model = automl.best_model_for_estimator(estimator)
            print(best_model.params["n_estimators"], config)

            # train on full data with no time limit
            automl._state.time_budget = None
            model, _ = automl._state._train_with_config(estimator, config)

            # assuming estimator & config are saved and loaded as follows
            automl = AutoML()
            automl.fit(
                X_train=X_train,
                y_train=y_train,
                max_iter=1,
                task="regression",
                estimator_list=[estimator],
                n_jobs=1,
                starting_points={estimator: config},
                use_ray=use_ray,
            )
            print(automl.best_config)

            # then the fitted model should be equivalent to model; xgboost and
            # catboost are compared through their dumps/params instead of str()
            assert (
                str(model.estimator) == str(automl.model.estimator)
                or (
                    estimator == "xgboost"
                    and str(model.estimator.get_dump())
                    == str(automl.model.estimator.get_dump())
                )
                or (
                    estimator == "catboost"
                    and str(model.estimator.get_all_params())
                    == str(automl.model.estimator.get_all_params())
                )
            )

            # Same replay, but with an empty starting point for the estimator.
            automl.fit(
                X_train=X_train,
                y_train=y_train,
                max_iter=1,
                task="regression",
                estimator_list=[estimator],
                n_jobs=1,
                starting_points={estimator: {}},
            )
            print(automl.best_config)

        with training_log_reader(filename) as reader:
            count = 0
            for count, record in enumerate(reader.records(), start=1):
                print(record)
            self.assertGreater(count, 0)

    # Fit again with logging to file disabled via an empty log file name.
    automl_settings["log_file_name"] = ""
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    automl._selected.update(None, 0)

    # A fit with max_iter=0 on a fresh instance.
    automl = AutoML()
    automl.fit(X_train=X_train, y_train=y_train, max_iter=0, task="regression")