def test_multioutput():
    """Fit AutoML inside scikit-learn's multi-output meta-regressors.

    A three-target regression problem is generated and split 70/30. An
    ``AutoML`` regressor is then wrapped first in ``MultiOutputRegressor``
    and afterwards in ``RegressorChain``; each wrapper is fitted on the
    training part and its predictions for the test part are printed.
    """
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.multioutput import MultiOutputRegressor, RegressorChain

    # Synthetic regression data with three target columns.
    features, targets = make_regression(n_targets=3)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.30, random_state=42
    )

    # Train and predict with each wrapper, in the same order as before.
    for wrapper in (MultiOutputRegressor, RegressorChain):
        model = wrapper(AutoML(task="regression", time_budget=1))
        model.fit(X_train, y_train)
        print(model.predict(X_test))
def test_regression_xgboost(self):
    """Run a regression search limited to two registered custom learners.

    ``MyXGB1`` and ``MyXGB2`` are registered as ``my_xgb1`` / ``my_xgb2`` and
    used as the only estimators. Training and validation data are random
    sparse matrices. After fitting, the stored validation matrix must have
    the shape of the one passed in; predictions and search results are then
    printed.
    """
    X_train = scipy.sparse.random(300, 900, density=0.0001)
    y_train = np.random.uniform(size=300)
    X_val = scipy.sparse.random(100, 900, density=0.0001)
    y_val = np.random.uniform(size=100)

    experiment = AutoML()
    experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
    experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
    settings = {
        "time_budget": 2,
        "estimator_list": ["my_xgb1", "my_xgb2"],
        "task": "regression",
        "log_file_name": "test/regression_xgboost.log",
        "n_jobs": 1,
        "model_history": True,
        "keep_search_state": True,
        "early_stop": True,
    }
    experiment.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
    )
    assert experiment._state.X_val.shape == X_val.shape

    print(experiment.predict(X_train))
    for attr in ("model", "config_history"):
        print(getattr(experiment, attr))
    print(experiment.best_model_for_estimator("my_xgb2"))
    for attr in (
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(experiment, attr))
def test_sparse_matrix_regression(self):
    """Fit a regression search on random sparse data with a validation set.

    The search optimises ``mae``. After fitting, ``X_val`` on the AutoML
    object must have the same shape as the validation matrix passed in;
    predictions and the search results are then printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "metric": "mae",
        "task": "regression",
        "log_file_name": "test/sparse_regression.log",
        "model_history": True,
    }
    X_train = scipy.sparse.random(300, 900, density=0.0001)
    y_train = np.random.uniform(size=300)
    X_val = scipy.sparse.random(100, 900, density=0.0001)
    y_val = np.random.uniform(size=100)

    experiment.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
    )
    assert experiment.X_val.shape == X_val.shape

    print(experiment.predict(X_train))
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(experiment, attr))
def test_training_log(self):
    """Check that fitting writes a readable, non-empty training log.

    The log is written into a temporary directory. After a short regression
    run the file must exist, and iterating it with ``training_log_reader``
    must yield at least one record (each record is printed).
    """
    with TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, "test_training_log.log")

        # Run a simple job.
        experiment = AutoML()
        settings = {
            "time_budget": 2,
            "metric": "mse",
            "task": "regression",
            "log_file_name": filename,
            "log_training_metric": True,
            "mem_thres": 1024 * 1024,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_boston(return_X_y=True)
        experiment.fit(X_train=X_train, y_train=y_train, **settings)

        # Check if the training log file is populated.
        self.assertTrue(os.path.exists(filename))
        with training_log_reader(filename) as reader:
            n_records = 0
            for n_records, record in enumerate(reader.records(), start=1):
                print(record)
            self.assertGreater(n_records, 0)
def test_regression(self):
    """Fit a regression search with the second half of the data as validation.

    The first half of the Boston data is used for training and the rest for
    validation. The test asserts the stored validation target length and
    that the evaluation method is ``'holdout'``, then prints predictions,
    search results and the parsed training log.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "metric": "mse",
        "task": "regression",
        "log_file_name": "test/boston.log",
        "log_training_metric": True,
        "model_history": True,
    }
    X_train, y_train = load_boston(return_X_y=True)
    n = len(y_train)
    half = n >> 1  # split point: floor(n / 2)
    experiment.fit(
        X_train=X_train[:half],
        y_train=y_train[:half],
        X_val=X_train[half:],
        y_val=y_train[half:],
        **settings
    )
    assert experiment.y_val.shape[0] == n - half
    assert experiment.eval_method == "holdout"

    print(experiment.predict(X_train))
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
    ):
        print(getattr(experiment, attr))
    print(get_output_from_log(settings["log_file_name"], 1))
def _test_custom_data():
    """Fit a ``seq-classification`` task on TSV files and predict.

    The train/dev/test splits are read from ``data/input``. The function
    returns silently if reading raises ``requests.exceptions.HTTPError``.
    After fitting with the dev split as validation data, ``predict`` is
    called on the test frame, on a list holding one string, and on a list
    of string pairs.
    """
    from flaml import AutoML
    import requests
    import pandas as pd

    tsv_paths = (
        "data/input/train.tsv",
        "data/input/dev.tsv",
        "data/input/test.tsv",
    )
    try:
        train_dataset, dev_dataset, test_dataset = [
            pd.read_csv(tsv_path, delimiter="\t", quoting=3)
            for tsv_path in tsv_paths
        ]
    except requests.exceptions.HTTPError:
        return

    custom_sent_keys = ["#1 String", "#2 String"]
    label_key = "Quality"

    X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key]
    X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key]
    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "data/output/",
            "ckpt_per_epoch": 1,
        },
    }
    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )

    automl.predict(X_test)
    automl.predict(["test test"])
    automl.predict([["test test", "test test"] for _ in range(3)])
def _test_ray_classification():
    """Fit a classification task with two concurrent trials.

    The data comes from ``make_classification(1000, 10)``; the fit is given
    a 10 second budget.
    """
    from sklearn.datasets import make_classification

    features, labels = make_classification(1000, 10)
    classifier = AutoML()
    classifier.fit(
        features,
        labels,
        time_budget=10,
        task="classification",
        n_concurrent_trials=2,
    )
def test_classification(self, as_frame=False):
    """Fit a classifier on iris, then retrain a fresh instance from the log.

    Args:
        as_frame: Forwarded to ``load_iris`` to choose the returned data
            container.

    After the search the classes, the first five probability rows and the
    search results are printed. A new ``AutoML`` object is then retrained on
    the full data from record 0 of the log file, and the duration, model and
    first five probability rows are printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 4,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "log_training_metric": True,
        "model_history": True,
    }
    X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
    experiment.fit(X_train=X_train, y_train=y_train, **settings)

    print(experiment.classes_)
    print(experiment.predict_proba(X_train)[:5])
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
    ):
        print(getattr(experiment, attr))

    # Drop the search-only settings; only the log file name is used below.
    for key in ("metric", "model_history", "log_training_metric"):
        del settings[key]

    experiment = AutoML()
    duration = experiment.retrain_from_log(
        log_file_name=settings["log_file_name"],
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        record_id=0,
    )
    print(duration)
    print(experiment.model)
    print(experiment.predict_proba(X_train)[:5])
def test_roc_auc_ovo(self):
    """Fit a one-second classification search on iris with ``roc_auc_ovo``."""
    experiment = AutoML()
    settings = {
        "time_budget": 1,
        "metric": "roc_auc_ovo",
        "task": "classification",
        "log_file_name": "test/roc_auc_ovo.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    features, labels = load_iris(return_X_y=True)
    experiment.fit(X_train=features, y_train=labels, **settings)
def test_custom_metric(self):
    """Fit on an iris dataframe using ``custom_metric`` and inspect the log.

    The data is passed as a dataframe plus label column name, with
    cross-validation, unit sample weights and ensembling enabled. Search
    results are printed, an estimator is rebuilt from record 0 of the log,
    and the metric history parsed from the log is printed. If ``ray`` can
    be imported, the dataframe is put into the ray object store and the fit
    is repeated with ``use_ray=True``; an ``ImportError`` is ignored.
    """
    df, y = load_iris(return_X_y=True, as_frame=True)
    df["label"] = y
    experiment = AutoML()
    settings = {
        "dataframe": df,
        "label": "label",
        "time_budget": 5,
        "eval_method": "cv",
        "metric": custom_metric,
        "task": "classification",
        "log_file_name": "test/iris_custom.log",
        "log_training_metric": True,
        "log_type": "all",
        "n_jobs": 1,
        "model_history": True,
        "sample_weight": np.ones(len(y)),
        "pred_time_limit": 1e-5,
        "ensemble": True,
    }
    experiment.fit(**settings)

    for attr in ("classes_", "model", "config_history"):
        print(getattr(experiment, attr))
    print(experiment.best_model_for_estimator("rf"))
    for attr in ("best_iteration", "best_estimator"):
        print(getattr(experiment, attr))

    experiment = AutoML()
    estimator = experiment.get_estimator_from_log(
        settings["log_file_name"], record_id=0, task="multi"
    )
    print(estimator)

    log_output = get_output_from_log(
        filename=settings["log_file_name"], time_budget=6
    )
    (
        _time_history,
        _best_valid_loss_history,
        _valid_loss_history,
        _config_history,
        metric_history,
    ) = log_output
    print(metric_history)

    try:
        import ray

        df = ray.put(df)
        settings["dataframe"] = df
        settings["use_ray"] = True
        experiment.fit(**settings)
    except ImportError:
        pass
def test_cv():
    """Fit a ``seq-classification`` task on four sentence pairs with 3 splits.

    No validation data is passed; ``n_splits`` is set to 3. The function
    returns silently if ``fit`` raises ``requests.exceptions.HTTPError``.
    """
    from flaml import AutoML
    import pandas as pd
    import requests

    first_sentences = [
        'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
        "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
        "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
        "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
    ]
    second_sentences = [
        'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
        "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
        "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
        "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
    ]
    train_dataset = pd.DataFrame(
        {
            "sentence1": first_sentences,
            "sentence2": second_sentences,
            "label": [1, 0, 1, 0],
            "idx": [0, 1, 2, 3],
        }
    )

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"
    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    automl = AutoML()
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "n_splits": 3,
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "test/data/output/",
            "ckpt_per_epoch": 1,
            "fp16": False,
        },
    }

    try:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    except requests.exceptions.HTTPError:
        return
def test_custom_metric(self):
    """Fit on iris with ``custom_metric`` under holdout and inspect the log.

    Search results are printed, an estimator is rebuilt from record 0 of the
    log with ``objective='multi'``, and the training-loss history parsed from
    the log is printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 10,
        "eval_method": "holdout",
        "metric": custom_metric,
        "task": "classification",
        "log_file_name": "test/iris_custom.log",
        "log_training_metric": True,
        "log_type": "all",
        "model_history": True,
    }
    X_train, y_train = load_iris(return_X_y=True)
    experiment.fit(X_train=X_train, y_train=y_train, **settings)

    print(experiment.classes_)
    print(experiment.predict_proba(X_train))
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
    ):
        print(getattr(experiment, attr))

    experiment = AutoML()
    estimator = experiment.get_estimator_from_log(
        settings["log_file_name"], record_id=0, objective="multi"
    )
    print(estimator)

    log_output = get_output_from_log(
        filename=settings["log_file_name"], time_budget=6
    )
    (
        _time_history,
        _best_valid_loss_history,
        _valid_loss_history,
        _config_history,
        train_loss_history,
    ) = log_output
    print(train_loss_history)
def test_sparse_matrix_lr(self):
    """Fit the ``lrl1`` / ``lrl2`` estimators twice on a random sparse matrix.

    The first fit uses a 3 second budget with ``train_time_limit=1``; the
    second raises the budget to 5 seconds and drops the limit. Predictions
    and search results of the second fit are printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 3,
        "metric": "f1",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["lrl1", "lrl2"],
        "log_type": "all",
        "n_jobs": 1,
    }
    X_train = scipy.sparse.random(3000, 3000, density=0.1)
    y_train = np.random.randint(2, size=3000)

    experiment.fit(X_train=X_train, y_train=y_train, train_time_limit=1, **settings)
    settings["time_budget"] = 5
    experiment.fit(X_train=X_train, y_train=y_train, **settings)

    print(experiment.predict(X_train))
    for attr in ("model", "config_history"):
        print(getattr(experiment, attr))
    print(experiment.best_model_for_estimator("lrl2"))
    for attr in ("best_iteration", "best_estimator"):
        print(getattr(experiment, attr))
def test_random_skip_oom(self):
    """Run random search with two concurrent trials on a large sparse matrix.

    ``MyLargeLGBM`` is registered as ``large_lgbm`` and is the only
    estimator. An ``ImportError`` raised during the fit or the reporting is
    treated as "ray is not installed": a message is printed and the test
    returns.
    """
    experiment = AutoML()
    experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": "test/sparse_classification_oom.log",
        "estimator_list": ["large_lgbm"],
        "log_type": "all",
        "n_jobs": 1,
        "hpo_method": "random",
        "n_concurrent_trials": 2,
    }
    X_train = scipy.sparse.eye(900000)
    y_train = np.random.randint(2, size=900000)

    try:
        experiment.fit(X_train=X_train, y_train=y_train, **settings)
        print(experiment.predict(X_train))
        for attr in ("model", "config_history"):
            print(getattr(experiment, attr))
        print(experiment.best_model_for_estimator("large_lgbm"))
        for attr in ("best_iteration", "best_estimator"):
            print(getattr(experiment, attr))
    except ImportError:
        print("skipping concurrency test as ray is not installed")
        return
def test_micro_macro_f1(self):
    """Fit iris with ``micro_f1`` and ``macro_f1``, then print diagnostics.

    Two separate AutoML objects are fitted with the same settings apart from
    the metric. The macro-F1 model's predictions are used to print the
    normalised confusion matrix and the multi-class ROC and precision-recall
    curves.
    """
    micro_experiment = AutoML()
    macro_experiment = AutoML()
    shared_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": "test/micro_macro_f1.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    X_train, y_train = load_iris(return_X_y=True)
    micro_experiment.fit(
        X_train=X_train, y_train=y_train, metric="micro_f1", **shared_settings
    )
    macro_experiment.fit(
        X_train=X_train, y_train=y_train, metric="macro_f1", **shared_settings
    )

    estimator = macro_experiment.model
    y_pred = estimator.predict(X_train)
    y_pred_proba = estimator.predict_proba(X_train)

    from flaml.ml import norm_confusion_matrix, multi_class_curves
    from sklearn.metrics import roc_curve, precision_recall_curve

    print(norm_confusion_matrix(y_train, y_pred))
    for curve_func in (roc_curve, precision_recall_curve):
        print(multi_class_curves(y_train, y_pred_proba, curve_func))
def test_binary(self):
    """Fit the ``binary`` task on breast-cancer data and call ``predict``."""
    experiment = AutoML()
    settings = {
        "time_budget": 1,
        "task": "binary",
        "log_file_name": "test/breast_cancer.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    features, labels = load_breast_cancer(return_X_y=True)
    experiment.fit(X_train=features, y_train=labels, **settings)
    # Only checks that prediction runs; the result is discarded.
    _ = experiment.predict(features)
def test_roc_auc_ovr(self):
    """Fit iris with ``roc_auc_ovr``, holdout evaluation and unit weights."""
    experiment = AutoML()
    features, labels = load_iris(return_X_y=True)
    settings = {
        "time_budget": 1,
        "metric": "roc_auc_ovr",
        "task": "classification",
        "log_file_name": "test/roc_auc_ovr.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "sample_weight": np.ones(len(labels)),
        "eval_method": "holdout",
        "model_history": True,
    }
    experiment.fit(X_train=features, y_train=labels, **settings)
def test_numpy_large():
    """Fit a ``ts_forecast`` task on 70,000 minute-spaced NumPy samples.

    A minute-frequency timestamp index and random integer targets are
    generated; all but the last 10 samples are passed to ``fit`` as NumPy
    arrays with a forecast horizon of 10 periods.
    """
    import numpy as np
    import pandas as pd
    from flaml import AutoML

    # "min" is the minute-frequency alias; the "T" spelling used previously
    # is deprecated in recent pandas releases and yields the same index.
    X_train = pd.date_range("2017-01-01", periods=70000, freq="min")
    y_train = pd.DataFrame(np.random.randint(6500, 7500, 70000))
    automl = AutoML()
    automl.fit(
        X_train=X_train[:-10].values,  # a single column of timestamp
        y_train=y_train[:-10].values,  # value for each timestamp
        period=10,  # time horizon to forecast, e.g., 12 months
        task="ts_forecast",
        time_budget=10,  # time budget in seconds
    )
def test_fit_w_starting_point(self, as_frame=True):
    """Fit on iris, then fit again seeded with the first run's best configs.

    Args:
        as_frame: Forwarded to ``load_iris``. When true, the columns are
            renamed to integers and an all-zero column is appended before
            fitting.

    The best configuration per estimator from the first search is passed as
    ``starting_points`` to a second search on a fresh AutoML object. A short
    summary is printed after each search.
    """

    def _report(experiment):
        # Summary of one finished search, printed in a fixed order.
        val_accuracy = 1.0 - experiment.best_loss
        print("Best ML leaner:", experiment.best_estimator)
        print("Best hyperparmeter config:", experiment.best_config)
        print("Best accuracy on validation data: {0:.4g}".format(val_accuracy))
        print(
            "Training duration of best run: {0:.4g} s".format(
                experiment.best_config_train_time
            )
        )

    first_experiment = AutoML()
    first_settings = {
        "time_budget": 3,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
    if as_frame:
        # test drop column
        X_train.columns = range(X_train.shape[1])
        X_train[X_train.shape[1]] = np.zeros(len(y_train))
    first_experiment.fit(X_train=X_train, y_train=y_train, **first_settings)
    _report(first_experiment)

    starting_points = first_experiment.best_config_per_estimator
    print("starting_points", starting_points)
    print("loss of the starting_points", first_experiment.best_loss_per_estimator)

    resume_settings = {
        "time_budget": 2,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris_resume.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
        "log_type": "all",
        "starting_points": starting_points,
    }
    resumed_experiment = AutoML()
    resumed_experiment.fit(X_train=X_train, y_train=y_train, **resume_settings)
    _report(resumed_experiment)
def test_logging_level(self):
    """Check that a fit emits INFO-level output on the flaml logger.

    A ``StreamHandler`` writing to an in-memory buffer is attached to the
    flaml logger; after a short regression fit the buffer must be non-empty.
    The handler is detached again even when the fit or the assertion fails,
    so it does not stay on the shared logger for the rest of the process.
    Finally the fitted object is pickled to ``automl.pkl`` and its
    ``__version__`` is printed.
    """
    from flaml import logger, logger_formatter

    with tempfile.TemporaryDirectory() as d:
        training_log = os.path.join(d, "training.log")

        # Configure logging for the FLAML logger
        # and add a handler that outputs to a buffer.
        logger.setLevel(logging.INFO)
        buf = io.StringIO()
        ch = logging.StreamHandler(buf)
        ch.setFormatter(logger_formatter)
        logger.addHandler(ch)
        try:
            # Run a simple job.
            automl = AutoML()
            automl_settings = {
                "time_budget": 1,
                "metric": 'mse',
                "task": 'regression',
                "log_file_name": training_log,
                "log_training_metric": True,
                "n_jobs": 1,
                "model_history": True,
            }
            X_train, y_train = load_boston(return_X_y=True)
            n = len(y_train) >> 1
            automl.fit(
                X_train=X_train[:n],
                y_train=y_train[:n],
                X_val=X_train[n:],
                y_val=y_train[n:],
                **automl_settings
            )

            # Check if the log buffer is populated.
            self.assertGreater(len(buf.getvalue()), 0)
        finally:
            # The logger is module-level state shared with other tests:
            # leaving the handler attached would keep feeding this buffer.
            logger.removeHandler(ch)

    import pickle

    with open('automl.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    print(automl.__version__)
def test_regression(self):
    """Fit a regression search on California housing with a 90/10 holdout.

    After asserting that the evaluation method is ``"holdout"``, predictions,
    search results and the parsed log are printed. The model is then
    retrained from the log on the full data twice, with time budgets of 1
    and 0 seconds.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "task": "regression",
        "log_file_name": "test/california.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    X_train, y_train = fetch_california_housing(return_X_y=True)
    n = int(len(y_train) * 9 // 10)
    experiment.fit(
        X_train=X_train[:n],
        y_train=y_train[:n],
        X_val=X_train[n:],
        y_val=y_train[n:],
        **settings
    )
    assert experiment._state.eval_method == "holdout"

    print(experiment.predict(X_train))
    for attr in ("model", "config_history"):
        print(getattr(experiment, attr))
    print(experiment.best_model_for_estimator("xgboost"))
    for attr in ("best_iteration", "best_estimator"):
        print(getattr(experiment, attr))
    print(get_output_from_log(settings["log_file_name"], 1))

    # Retrain from the log with a small budget and then with no budget.
    for retrain_budget in (1, 0):
        experiment.retrain_from_log(
            task="regression",
            log_file_name=settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=retrain_budget,
        )
def test_sparse_matrix_regression_cv(self):
    """Fit a cross-validated regression search on a random sparse matrix.

    Predictions on the training matrix and the search results are printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "eval_method": "cv",
        "task": "regression",
        "log_file_name": "test/sparse_regression.log",
        "model_history": True,
    }
    X_train = scipy.sparse.random(100, 100)
    y_train = np.random.uniform(size=100)
    experiment.fit(X_train=X_train, y_train=y_train, **settings)

    print(experiment.predict(X_train))
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
    ):
        print(getattr(experiment, attr))
def test_parallel_xgboost(self, hpo_method=None):
    """Fit xgboost with two concurrent trials on data stored in ray.

    Args:
        hpo_method: Forwarded to ``fit`` as the ``hpo_method`` setting.

    The training matrix is put into the ray object store and the resulting
    reference is passed to ``fit``. If an ``ImportError`` is raised (ray
    itself is imported inside the guarded section) the test returns
    silently.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 10,
        "metric": "ap",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 2,
        "hpo_method": hpo_method,
    }
    X_train = scipy.sparse.eye(900000)
    y_train = np.random.randint(2, size=900000)

    try:
        import ray

        X_train_ref = ray.put(X_train)
        experiment.fit(X_train=X_train_ref, y_train=y_train, **settings)
        print(experiment.predict(X_train))
        for attr in ("model", "config_history"):
            print(getattr(experiment, attr))
        print(experiment.best_model_for_estimator("xgboost"))
        for attr in ("best_iteration", "best_estimator"):
            print(getattr(experiment, attr))
    except ImportError:
        return
def test_ray_classification(self):
    """Fit breast-cancer classification with ``use_ray`` and with 2 trials.

    The data is split 75/25 and the held-out part is used as validation
    data. The same AutoML object is fitted twice: first with
    ``use_ray=True``, then with ``n_concurrent_trials=2``. An
    ``ImportError`` from either fit makes the test return silently.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25
    )
    automl = AutoML()
    try:
        # The two runs differ only in this one extra keyword argument.
        for extra_option in ({"use_ray": True}, {"n_concurrent_trials": 2}):
            automl.fit(
                X_train,
                y_train,
                X_val=X_test,
                y_val=y_test,
                time_budget=10,
                task="classification",
                **extra_option
            )
    except ImportError:
        return
def test_mlflow():
    """Log a fitted AutoML object with mlflow and load it back.

    ``mlflow`` is pip-installed into the running interpreter first. OpenML
    task 7592 is loaded; on ``OpenMLServerException`` or
    ``ChunkedEncodingError`` the error is printed and the test returns. The
    fitted model is logged in an mlflow run, reloaded through
    ``mlflow.pyfunc`` and used to predict. The sequence is repeated with
    ``use_ray=True``, reloading through ``mlflow.sklearn``; an
    ``ImportError`` in that part is ignored.
    """
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])

    import mlflow
    from flaml.data import load_openml_task

    try:
        X_train, X_test, y_train, y_test = load_openml_task(
            task_id=7592, data_dir="test/"
        )
    except (OpenMLServerException, ChunkedEncodingError) as err:
        print(err)
        return

    # import AutoML class from flaml package
    from flaml import AutoML

    automl = AutoML()
    settings = {
        "time_budget": 5,  # total running time in seconds
        # primary metrics can be chosen from:
        # ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        "metric": "accuracy",
        "estimator_list": ["lgbm", "rf", "xgboost"],  # list of ML learners
        "task": "classification",  # task type
        "sample": False,  # whether to subsample training data
        "log_file_name": "adult.log",  # flaml log file
    }

    mlflow.set_experiment("flaml")
    with mlflow.start_run() as run:
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        mlflow.sklearn.log_model(automl, "automl")
    loaded_model = mlflow.pyfunc.load_model(f"{run.info.artifact_uri}/automl")
    print(loaded_model.predict(X_test))

    automl._mem_thres = 0
    print(automl.trainable(automl.points_to_evaluate[0]))

    settings["use_ray"] = True
    try:
        with mlflow.start_run() as run:
            automl.fit(X_train=X_train, y_train=y_train, **settings)
            mlflow.sklearn.log_model(automl, "automl")
        automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl")
        print(automl.predict_proba(X_test))
    except ImportError:
        pass
def test_sparse_matrix_regression(self):
    """Fit a ``mae`` regression search on random sparse data, keeping state.

    The search runs with ``keep_search_state``, ``early_stop`` and
    ``verbose=0``. After fitting, the stored validation matrix must have the
    shape of the one passed in; predictions and search results are then
    printed.
    """
    X_train = scipy.sparse.random(300, 900, density=0.0001)
    y_train = np.random.uniform(size=300)
    X_val = scipy.sparse.random(100, 900, density=0.0001)
    y_val = np.random.uniform(size=100)

    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "metric": "mae",
        "task": "regression",
        "log_file_name": "test/sparse_regression.log",
        "n_jobs": 1,
        "model_history": True,
        "keep_search_state": True,
        "verbose": 0,
        "early_stop": True,
    }
    experiment.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
    )
    assert experiment._state.X_val.shape == X_val.shape

    print(experiment.predict(X_train))
    for attr in ("model", "config_history"):
        print(getattr(experiment, attr))
    print(experiment.best_model_for_estimator("rf"))
    for attr in (
        "best_iteration",
        "best_estimator",
        "best_config",
        "best_loss",
        "best_config_train_time",
    ):
        print(getattr(experiment, attr))
def test_datetime_columns(self):
    """Fit and predict on a dataframe whose columns are all datetimes.

    The frame has six rows and three columns (``A``, ``B``, ``year_A``), all
    dated in the year 1900; column ``B`` holds the same date in every row.
    """

    def _dates_in_1900(month_day_pairs):
        # Expand (month, day) pairs into datetime objects in the year 1900.
        return [datetime(1900, month, day) for month, day in month_day_pairs]

    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "metric": "mse",
        "task": "regression",
        "log_file_name": "test/datetime_columns.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    fake_df = pd.DataFrame(
        {
            "A": _dates_in_1900([(2, 3), (3, 4), (3, 4), (3, 4), (7, 2), (8, 9)]),
            "B": _dates_in_1900([(1, 1)] * 6),
            "year_A": _dates_in_1900(
                [(1, 2), (8, 1), (1, 4), (6, 1), (1, 5), (4, 1)]
            ),
        }
    )
    y = np.array([0, 1, 0, 1, 0, 0])
    experiment.fit(X_train=fake_df, y_train=y, **settings)
    # Only checks that prediction runs; the result is discarded.
    _ = experiment.predict(fake_df)
def test_xgboost():
    """Fit the xgboost estimators with and without ``gpu_per_trial=1``.

    Three fits are run: a sparse identity matrix with one GPU per trial,
    ``make_moons`` data with one GPU per trial, and the same ``make_moons``
    data without the GPU setting. An ``XGBoostError`` at any point makes the
    test return silently.
    """
    from flaml import AutoML
    from sklearn.datasets import make_moons
    import scipy.sparse
    import numpy as np
    from xgboost.core import XGBoostError

    estimator_names = ("xgb_limitdepth", "xgboost")
    try:
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl = AutoML()
        automl.fit(
            X_train,
            y_train,
            estimator_list=list(estimator_names),
            time_budget=5,
            gpu_per_trial=1,
        )

        train, label = make_moons(
            n_samples=300000, shuffle=True, noise=0.3, random_state=None
        )
        automl = AutoML()
        automl.fit(
            train,
            label,
            estimator_list=list(estimator_names),
            time_budget=5,
            gpu_per_trial=1,
        )
        automl.fit(
            train,
            label,
            estimator_list=list(estimator_names),
            time_budget=5,
        )
    except XGBoostError:
        # No visible GPU is found for XGBoost.
        return
def test_package_minimum():
    """Smoke-test the minimal fit / predict workflow on iris.

    After a 10 second accuracy search the test asserts that the object has a
    ``best_config`` attribute, that ``iris.log`` exists, that a model is
    available, and that ``predict_proba`` on the training data has shape
    ``(150, 3)``.
    """
    # Specify automl goal and constraint
    settings = {
        "time_budget": 10,  # in seconds
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "iris.log",
    }
    automl = AutoML()
    features, labels = load_iris(return_X_y=True)

    # Train with labeled input data
    automl.fit(X_train=features, y_train=labels, **settings)

    # The search must have produced a config, a log file and a model.
    assert hasattr(automl, "best_config")
    assert Path("iris.log").exists()
    assert automl.model is not None
    print(automl.model)

    # Predict and check that the prediction shape is as expected
    probabilities = automl.predict_proba(features)
    assert probabilities.shape == (150, 3)
    print(probabilities)
def test_sparse_matrix_xgboost(self):
    """Fit xgboost on a large sparse identity matrix with the ``ap`` metric.

    Predictions on the training matrix and the search results are printed.
    """
    experiment = AutoML()
    settings = {
        "time_budget": 2,
        "metric": "ap",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
    }
    X_train = scipy.sparse.eye(900000)
    y_train = np.random.randint(2, size=900000)
    experiment.fit(X_train=X_train, y_train=y_train, **settings)

    print(experiment.predict(X_train))
    for attr in (
        "model",
        "config_history",
        "model_history",
        "best_iteration",
        "best_estimator",
    ):
        print(getattr(experiment, attr))