def test_regression(self):
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 2,
        "metric": "mse",
        "task": "regression",
        "log_file_name": "test/boston.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    # load_boston was removed in scikit-learn 1.2; this variant requires an
    # older scikit-learn (see the California-housing variant further below).
    X_train, y_train = load_boston(return_X_y=True)
    n = int(len(y_train) * 9 // 10)
    automl_experiment.fit(
        X_train=X_train[:n],
        y_train=y_train[:n],
        X_val=X_train[n:],
        y_val=y_train[n:],
        **automl_settings,
    )
    assert automl_experiment._state.eval_method == "holdout"
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.model_history)
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    print(get_output_from_log(automl_settings["log_file_name"], 1))
def test_custom_metric(self):
    X_train, y_train = load_iris(return_X_y=True)
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 10,
        "eval_method": "holdout",
        "metric": custom_metric,
        "task": "classification",
        "log_file_name": "test/iris_custom.log",
        "log_training_metric": True,
        "log_type": "all",
        "n_jobs": 1,
        "model_history": True,
        "sample_weight": np.ones(len(y_train)),
    }
    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.classes_)
    print(automl_experiment.predict_proba(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.model_history)
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    automl_experiment = AutoML()
    estimator = automl_experiment.get_estimator_from_log(
        automl_settings["log_file_name"], record_id=0, task="multi"
    )
    print(estimator)
    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        train_loss_history,
    ) = get_output_from_log(
        filename=automl_settings["log_file_name"], time_budget=6
    )
    print(train_loss_history)
def test_custom_metric(self):
    df, y = load_iris(return_X_y=True, as_frame=True)
    df["label"] = y
    automl_experiment = AutoML()
    automl_settings = {
        "dataframe": df,
        "label": "label",
        "time_budget": 5,
        "eval_method": "cv",
        "metric": custom_metric,
        "task": "classification",
        "log_file_name": "test/iris_custom.log",
        "log_training_metric": True,
        "log_type": "all",
        "n_jobs": 1,
        "model_history": True,
        "sample_weight": np.ones(len(y)),
        "pred_time_limit": 1e-5,
        "ensemble": True,
    }
    automl_experiment.fit(**automl_settings)
    print(automl_experiment.classes_)
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("rf"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    automl_experiment = AutoML()
    estimator = automl_experiment.get_estimator_from_log(
        automl_settings["log_file_name"], record_id=0, task="multi"
    )
    print(estimator)
    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(
        filename=automl_settings["log_file_name"], time_budget=6
    )
    print(metric_history)
    try:
        import ray

        df = ray.put(df)
        automl_settings["dataframe"] = df
        automl_settings["use_ray"] = True
        automl_experiment.fit(**automl_settings)
    except ImportError:
        pass
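# Both test_custom_metric variants above pass a user-defined `custom_metric`
# callable that is not included in this excerpt. The function below is a
# hedged sketch following FLAML's documented custom-metric interface (return a
# loss to minimize plus a dict of metrics to log); it is illustrative, not the
# original helper.
def custom_metric(
    X_val,
    y_val,
    estimator,
    labels,
    X_train,
    y_train,
    weight_val=None,
    weight_train=None,
    *args,
    **kwargs,
):
    from sklearn.metrics import log_loss
    import time

    # measure per-sample prediction time on the validation split
    start = time.time()
    y_pred = estimator.predict_proba(X_val)
    pred_time = (time.time() - start) / len(X_val)
    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
    alpha = 0.5
    # loss to minimize, plus additional metrics for FLAML to log
    return val_loss * (1 + alpha) - alpha * train_loss, {
        "val_loss": val_loss,
        "train_loss": train_loss,
        "pred_time": pred_time,
    }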
def test_regression(self):
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "regression",
        "log_file_name": "test/california.log",
        "log_training_metric": True,
        "n_jobs": 1,
        "model_history": True,
    }
    X_train, y_train = fetch_california_housing(return_X_y=True)
    n = int(len(y_train) * 9 // 10)
    automl_experiment.fit(
        X_train=X_train[:n],
        y_train=y_train[:n],
        X_val=X_train[n:],
        y_val=y_train[n:],
        **automl_settings,
    )
    assert automl_experiment._state.eval_method == "holdout"
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("xgboost"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    print(get_output_from_log(automl_settings["log_file_name"], 1))
    automl_experiment.retrain_from_log(
        task="regression",
        log_file_name=automl_settings["log_file_name"],
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        time_budget=1,
    )
    automl_experiment.retrain_from_log(
        task="regression",
        log_file_name=automl_settings["log_file_name"],
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        time_budget=0,
    )
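# The four test methods above are excerpted from a unittest-style test module;
# the module-level imports are not included in this excerpt. A plausible
# reconstruction from usage (an assumption, not the original header):
import numpy as np
from sklearn.datasets import fetch_california_housing, load_iris
from flaml import AutoML
from flaml.data import get_output_from_log

# load_boston is only available in scikit-learn < 1.2
# from sklearn.datasets import load_boston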
def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
    from flaml.data import load_openml_dataset
    from openml.exceptions import OpenMLServerException
    from requests.exceptions import ChunkedEncodingError

    try:
        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id=1169, data_dir="test/", dataset_format=dataset_format
        )
    except (OpenMLServerException, ChunkedEncodingError) as e:
        print(e)
        return
    """ import AutoML class from flaml package """
    from flaml import AutoML

    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "accuracy",  # primary metric; can be chosen from:
        # ['accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'f1', 'log_loss', 'mae', 'mse', 'r2']
        "task": "classification",  # task type
        "log_file_name": "airlines_experiment.log",  # flaml log file
        "seed": 7654321,  # random seed
        "hpo_method": hpo_method,
    }
    """The main flaml automl API"""
    automl.fit(X_train=X_train, y_train=y_train, **settings)
    """ retrieve best config and best learner """
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
    print(
        "Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
    )
    print(automl.model.estimator)
    print("time taken to find best model:", automl.time_to_find_best_model)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)
    y_pred_proba = automl.predict_proba(X_test)[:, 1]
    """ compute different metric values on testing dataset """
    from flaml.ml import sklearn_metric_loss_score

    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))
    print(
        "roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test)
    )
    print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test))
    from flaml.data import get_output_from_log

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)
    automl.fit(X_train=X_train, y_train=y_train, ensemble=True, **settings)
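# The MLflow fragment that follows is taken from inside a per-model loop of an
# experiment script; TIME_BUDGET, PLOTS_ROOT, m, settings, automl, and the
# train/test splits are defined outside this excerpt. The setup below is a
# minimal, assumed reconstruction for illustration only: the dataset, the
# constant values, and the model identifier `m` are placeholders, and an
# active MLflow run (e.g. inside `with mlflow.start_run():`) is assumed around
# the fragment.
import mlflow
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from flaml import AutoML
from flaml.data import get_output_from_log

TIME_BUDGET = 60  # assumed tuning budget in seconds
PLOTS_ROOT = "plots"  # assumed output directory for learning-curve plots
m = "example_model"  # assumed model identifier used in titles and file names

# stand-in dataset; the original script uses its own train/test splits
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

settings = {
    "time_budget": TIME_BUDGET,
    "metric": "log_loss",
    "task": "classification",
    "log_file_name": f"automl_{m}.log",
}
automl = AutoML()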
mlflow.log_param('TIME_BUDGET', TIME_BUDGET)
mlflow.log_params(settings)
automl.fit(X_train=X_train, y_train=y_train, X_val=X_test, y_val=y_test, **settings)
print('### AUTO ML')
print('Best hyperparameter config:', automl.best_config)
print('Best log_loss on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
    get_output_from_log(filename=settings['log_file_name'], time_budget=TIME_BUDGET)
plt.title(f'Learning Curve - {m}')
plt.xlabel('Wall Clock Time (s)')
plt.ylabel('Validation Accuracy')
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
file_path = f'{PLOTS_ROOT}/automl_learning_curve_{m}.png'
plt.savefig(file_path)
plt.close()
mlflow.log_artifact(file_path)
y_pred = automl.predict(X_test)
y_prob = automl.predict_proba(X_test)[:, 1]
def test_forecast_automl(budget=5):
    # using dataframe
    import statsmodels.api as sm

    data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
    data = (
        data.fillna(data.bfill())
        .to_frame()
        .reset_index()
        .rename(columns={"index": "ds", "co2": "y"})
    )
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    df = data[:split_idx]
    X_test = data[split_idx:]["ds"]
    y_test = data[split_idx:]["y"]
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast",  # task type
        "log_file_name": "test/CO2_forecast.log",  # flaml log file
        "eval_method": "holdout",
        "label": "y",
    }
    """The main flaml automl API"""
    try:
        import prophet

        automl.fit(dataframe=df, **settings, period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            dataframe=df,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )
    """ retrieve best config and best learner """
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)
    """ compute different metric values on testing dataset """
    from flaml.ml import sklearn_metric_loss_score

    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))
    from flaml.data import get_output_from_log

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)
    X_train = df[["ds"]]
    y_train = df["y"]
    automl = AutoML()
    try:
        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            X_train=X_train,
            y_train=y_train,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )
def test_forecast_classification(budget=5):
    from hcrystalball.utils import get_sales_data
    from hcrystalball.wrappers import get_sklearn_wrapper

    time_horizon = 30
    df = get_sales_data(n_dates=180, n_assortments=1, n_states=1, n_stores=1)
    df = df[["Sales", "Open", "Promo", "Promo2"]]
    # feature engineering
    import numpy as np

    df["above_mean_sales"] = np.where(df["Sales"] > df["Sales"].mean(), 1, 0)
    df.reset_index(inplace=True)
    train_df = df[:-time_horizon]
    test_df = df[-time_horizon:]
    X_train, X_test = (
        train_df[["Date", "Open", "Promo", "Promo2"]],
        test_df[["Date", "Open", "Promo", "Promo2"]],
    )
    y_train, y_test = train_df["above_mean_sales"], test_df["above_mean_sales"]
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "accuracy",  # primary metric
        "task": "ts_forecast_classification",  # task type
        "log_file_name": "test/sales_classification_forecast.log",  # flaml log file
        "eval_method": "holdout",
    }
    """The main flaml automl API"""
    automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
    """ retrieve best config and best learner """
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print(f"Best loss on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    """ compute different metric values on testing dataset """
    from flaml.ml import sklearn_metric_loss_score

    print(y_test)
    print(y_pred)
    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))
    from flaml.data import get_output_from_log

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)
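# The next test calls `load_multi_dataset_cat`, a helper that is not part of
# this excerpt (in FLAML's tests it loads an energy-demand dataset). The
# function below is a synthetic stand-in, assumed for illustration only: it
# produces a train/test split with the columns the test expects
# ("timeStamp", "demand", "season", "above_monthly_avg").
import numpy as np
import pandas as pd


def load_multi_dataset_cat(time_horizon):
    # two years of synthetic daily demand with a seasonal pattern
    index = pd.date_range(start="2017-01-01", periods=2 * 365, freq="D")
    rng = np.random.default_rng(42)
    demand = (
        1000
        + 100 * np.sin(2 * np.pi * index.dayofyear / 365)
        + rng.normal(0, 25, len(index))
    )
    df = pd.DataFrame({"timeStamp": index, "demand": demand})
    # categorical regressor: season of the year
    df["season"] = df["timeStamp"].dt.month.map(
        lambda month: "winter" if month in (12, 1, 2)
        else "spring" if month in (3, 4, 5)
        else "summer" if month in (6, 7, 8)
        else "autumn"
    )
    # binary regressor: is demand above the monthly average?
    monthly_avg = df.groupby(df["timeStamp"].dt.month)["demand"].transform("mean")
    df["above_monthly_avg"] = (df["demand"] > monthly_avg).astype(int)
    return df[:-time_horizon], df[-time_horizon:]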
def test_multivariate_forecast_cat(budget=5):
    time_horizon = 180
    train_df, test_df = load_multi_dataset_cat(time_horizon)
    # the test dataframe must contain values for the regressors / multivariate variables
    X_test = test_df[["timeStamp", "season", "above_monthly_avg"]]
    y_test = test_df["demand"]
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast",  # task type
        "log_file_name": "test/energy_forecast_categorical.log",  # flaml log file
        "eval_method": "holdout",
        "log_type": "all",
        "label": "demand",
    }
    """The main flaml automl API"""
    try:
        import prophet

        automl.fit(dataframe=train_df, **settings, period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            dataframe=train_df,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )
    """ retrieve best config and best learner """
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)
    """ compute different metric values on testing dataset """
    from flaml.ml import sklearn_metric_loss_score

    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))
    print("rmse", "=", sklearn_metric_loss_score("rmse", y_pred, y_test))
    print("mse", "=", sklearn_metric_loss_score("mse", y_pred, y_test))
    print("mae", "=", sklearn_metric_loss_score("mae", y_pred, y_test))
    from flaml.data import get_output_from_log

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)