def test_homogenous(var_type):
    dim = 5

    def fitness(_):
        return np.random.rand()

    if var_type == "r":
        lb, ub = -1, 5
        space = RealSpace([lb, ub]) * dim
        mean = trend.constant_trend(dim, beta=None)
        thetaL = 1e-10 * (ub - lb) * np.ones(dim)
        thetaU = 10 * (ub - lb) * np.ones(dim)
        theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
        model = GaussianProcess(
            mean=mean,
            corr="squared_exponential",
            theta0=theta0,
            thetaL=thetaL,
            thetaU=thetaU,
            nugget=0,
            noise_estim=False,
            optimizer="BFGS",
            wait_iter=3,
            random_start=dim,
            likelihood="concentrated",
            eval_budget=100 * dim,
        )
    else:
        if var_type == "b":
            space = BoolSpace() * dim
        elif var_type == "i":
            space = IntegerSpace([0, 10], step=1) * dim
        elif var_type == "c":
            space = DiscreteSpace(list(range(10))) * dim
        elif var_type == "o":
            space = OrdinalSpace(list(string.ascii_lowercase))
        elif var_type == "s":
            space = SubsetSpace(list(string.ascii_lowercase)[:5])
        model = RandomForest(levels=space.levels)

    opt = BO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        DoE_size=5,
        max_FEs=10,
        verbose=True,
        n_point=1,
    )
    print(opt.run())
def test_BO_equality():
    dim = 2
    search_space = RealSpace([0, 1]) * dim
    thetaL = 1e-5 * np.ones(dim)
    thetaU = np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        corr="squared_exponential",
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        nugget=1e-1,
        random_state=42,
    )
    xopt, _, __ = BO(
        search_space=search_space,
        obj_fun=obj_fun,
        eq_fun=h,
        model=model,
        max_FEs=20,
        DoE_size=3,
        acquisition_fun="MGFI",
        acquisition_par={"t": 2},
        acquisition_optimization={"optimizer": "BFGS"},
        verbose=True,
        random_seed=42,
    ).run()
    assert np.isclose(h(xopt), 0, atol=1e-1)
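# `obj_fun` and `h` above are defined elsewhere in the test module and are
# elided here. A minimal, hypothetical pair consistent with the final
# assertion: the names match, but the formulas below are illustrative
# assumptions, not the source's definitions.
import numpy as np

def obj_fun(x):
    # sphere objective on [0, 1]^2
    return np.sum(np.asarray(x) ** 2)

def h(x):
    # equality constraint: feasible iff x0 + x1 == 1
    return x[0] + x[1] - 1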
def test_BO_constraints():
    search_space = (
        IntegerSpace([1, 10], var_name="mu")
        + IntegerSpace([1, 10], var_name="lambda")
        + RealSpace([0, 1], var_name="pc")
        + RealSpace([0.005, 0.5], var_name="p")
    )
    model = RandomForest(levels=search_space.levels)
    xopt, _, __ = BO(
        search_space=search_space,
        obj_fun=obj_fun2,
        ineq_fun=g,
        model=model,
        max_FEs=10,
        DoE_size=3,
        eval_type="dict",
        acquisition_fun="MGFI",
        acquisition_par={"t": 2},
        n_job=1,
        n_point=1,
        verbose=True,
        random_seed=42,
    ).run()
    assert isinstance(xopt, dict)
    assert all(np.array(g(xopt)) <= 0)
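# `obj_fun2` and `g` are elided here; with eval_type="dict", both receive a
# dict keyed by the variable names declared above. A hypothetical,
# illustrative pair (not the source's definitions):
def obj_fun2(x):
    # toy objective over the named variables
    return x["mu"] + x["lambda"] + 10 * x["pc"] + x["p"]

def g(x):
    # inequality constraints; a point is feasible iff all components are <= 0
    return [x["pc"] - 0.8, x["p"] - 0.4]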
def test_BO(dim, obj_fun, ftarget, max_FEs, lb, ub, logfile):
    space = RealSpace(list(zip(lb, ub)))
    mean = trend.constant_trend(dim, beta=None)  # equivalent to Ordinary Kriging
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean,
        corr="matern",
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        noise_estim=False,
        nugget=1e-6,
        optimizer="BFGS",
        wait_iter=5,
        random_start=5 * dim,
        likelihood="concentrated",
        eval_budget=100 * dim,
    )
    return BO(
        search_space=space,
        obj_fun=obj_fun,
        model=model,
        DoE_size=dim * 5,
        max_FEs=max_FEs,
        verbose=False,
        n_point=1,
        minimize=True,
        ftarget=ftarget,
        logger=logfile,
    )
def test_BO(dim, obj_fun, ftarget, max_FEs, lb, ub, logfile):
    sys.path.insert(0, '../')
    from bayes_optim import AnnealingBO, BO, ContinuousSpace
    from bayes_optim.Surrogate import GaussianProcess, trend

    space = ContinuousSpace(list(zip(lb, ub)))
    mean = trend.constant_trend(dim, beta=None)  # equivalent to Ordinary Kriging
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean, corr='matern',
        theta0=theta0, thetaL=thetaL, thetaU=thetaU,
        noise_estim=False, nugget=1e-6,
        optimizer='BFGS', wait_iter=5, random_start=5 * dim,
        likelihood='concentrated', eval_budget=100 * dim,
    )
    return BO(
        search_space=space, obj_fun=obj_fun, model=model,
        DoE_size=dim * 5, max_FEs=max_FEs,
        verbose=False, n_point=1, minimize=True,
        ftarget=ftarget, logger=logfile,
    )
def test_warm_data_with_RF():
    space = ContinuousSpace([-10, 10]) * 2 + \
        OrdinalSpace([5, 15]) + \
        NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'])
    X = space.sampling(10)
    y = [obj_fun(x) for x in X]
    model = RandomForest(levels=space.levels)
    opt = BO(
        search_space=space, obj_fun=obj_fun, model=model,
        minimize=True, eval_type='list', max_FEs=10,
        verbose=True, acquisition_fun='EI', warm_data=(X, y),
    )
    opt.run()
    assert opt.data.shape[0] == 20
def test_infeasible_constraints():
    dim = 5
    lb, ub = -5, 5

    def fitness(_):
        return 1

    space = RealSpace([lb, ub]) * dim
    model = RandomForest(levels=space.levels)
    opt = BO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        DoE_size=5,
        # x[0] >= -5, so x[0] + 5.1 >= 0.1 > 0: the constraint is never satisfied
        ineq_fun=lambda x: x[0] + 5.1,
        max_FEs=10,
        verbose=True,
        n_point=1,
    )
    with pytest.raises(AskEmptyError):
        opt.run()
def test_warm_data_with_RF():
    space = (
        RealSpace([-10, 10]) * 2
        + IntegerSpace([5, 15])
        + DiscreteSpace(["OK", "A", "B", "C", "D", "E", "F", "G"])
    )
    X = space.sample(10)
    y = [obj_fun(x) for x in X]
    model = RandomForest(levels=space.levels)
    opt = BO(
        search_space=space,
        obj_fun=obj_fun,
        model=model,
        minimize=True,
        eval_type="list",
        max_FEs=5,
        verbose=True,
        acquisition_fun="EI",
        warm_data=(X, y),
    )
    opt.run()
    assert opt.data.shape[0] == 15
def test_warm_data_with_GPR():
    dim = 2
    lb, ub = -5, 5

    def fitness(x):
        x = np.asarray(x)
        return np.sum(x**2)

    X = np.random.rand(5, dim) * (ub - lb) + lb
    y = [fitness(x) for x in X]
    space = RealSpace([lb, ub]) * dim
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        nugget=0,
        noise_estim=False,
        optimizer="BFGS",
        wait_iter=3,
        random_start=dim,
        likelihood="concentrated",
        eval_budget=100 * dim,
    )
    opt = BO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        warm_data=(X, y),
        max_FEs=10,
        verbose=True,
        n_point=1,
    )
    assert np.all(np.asarray(opt.data) == np.asarray(opt.warm_data))
    assert opt.model.is_fitted
    opt.run()
def test_flat_continuous():
    dim = 5
    lb, ub = -1, 5

    def fitness(_):
        # constant objective: every evaluated point has identical fitness,
        # which should trigger FlatFitnessError
        return 1

    space = RealSpace([lb, ub]) * dim
    mean = trend.constant_trend(dim, beta=None)
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean,
        corr="squared_exponential",
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        nugget=0,
        noise_estim=False,
        optimizer="BFGS",
        wait_iter=3,
        random_start=dim,
        likelihood="concentrated",
        eval_budget=100 * dim,
    )
    opt = BO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        DoE_size=5,
        max_FEs=10,
        verbose=True,
        n_point=1,
    )
    with pytest.raises(FlatFitnessError):
        opt.run()
def test_continuous():
    dim = 5
    lb, ub = -1, 5

    def fitness(x):
        x = np.asarray(x)
        return np.sum(x**2)

    space = ContinuousSpace([lb, ub]) * dim
    mean = trend.constant_trend(dim, beta=None)
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean, corr='squared_exponential',
        theta0=theta0, thetaL=thetaL, thetaU=thetaU,
        nugget=0, noise_estim=False,
        optimizer='BFGS', wait_iter=3, random_start=dim,
        likelihood='concentrated', eval_budget=100 * dim,
    )
    opt = BO(
        search_space=space, obj_fun=fitness, model=model,
        DoE_size=5, max_FEs=10, verbose=True, n_point=1,
    )
    print(opt.run())
def test_BO(dim, obj_fun, ftarget, max_FEs, lb, ub, logfile):
    sys.path.insert(0, "../")
    from bayes_optim import BO, AnnealingBO, RealSpace
    from bayes_optim.Surrogate import GaussianProcess, trend

    space = RealSpace([lb, ub]) * dim
    # beta=0 fixes a zero-mean trend (Simple Kriging); beta=None would estimate
    # the constant trend from the data (Ordinary Kriging)
    mean = trend.constant_trend(dim, beta=0)
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean,
        corr="matern",
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        noise_estim=False,
        nugget=1e-6,
        optimizer="BFGS",
        wait_iter=5,
        random_start=5 * dim,
        likelihood="concentrated",
        eval_budget=100 * dim,
    )
    return BO(
        search_space=space,
        obj_fun=obj_fun,
        model=model,
        DoE_size=dim * 5,
        max_FEs=max_FEs,
        verbose=False,
        n_point=1,
        minimize=True,
        ftarget=ftarget,
        logger=logfile,
    )
def test_BO_bad_constraints():
    search_space = (
        DiscreteSpace(["1", "2", "3"], var_name="lambda")
        + RealSpace([0, 1], var_name="pc")
        + RealSpace([0.005, 0.5], var_name="p")
    )
    model = RandomForest(levels=search_space.levels)
    with pytest.raises(ConstraintEvaluationError):
        BO(
            search_space=search_space,
            obj_fun=lambda x: 10 * (x[0] == "3") + x[1] * x[2],
            # x[0] is a string level, so np.array(x) ** 2 fails to evaluate
            ineq_fun=lambda x: sum(np.array(x) ** 2),
            model=model,
            max_FEs=10,
            DoE_size=3,
            eval_type="list",
            acquisition_fun="MGFI",
            acquisition_par={"t": 2},
            n_job=1,
            n_point=1,
            verbose=True,
            random_seed=42,
        ).run()
def test_BO_constraints():
    search_space = OrdinalSpace([1, 10], var_name='mu') + \
        OrdinalSpace([1, 10], var_name='lambda') + \
        ContinuousSpace([0, 1], var_name='pc') + \
        ContinuousSpace([0.005, 0.5], var_name='p')
    model = RandomForest(levels=search_space.levels)
    xopt, _, __ = BO(
        search_space=search_space,
        obj_fun=obj_func,
        ineq_fun=g,
        model=model,
        max_FEs=30,
        DoE_size=3,
        eval_type='dict',
        acquisition_fun='MGFI',
        acquisition_par={'t': 2},
        n_job=1,
        n_point=1,
        verbose=True,
    ).run()
    assert isinstance(xopt, dict)
    assert all(np.array(g(xopt)) <= 0)
def test_pickling():
    dim = 5
    lb, ub = -1, 5

    def fitness(x):
        x = np.asarray(x)
        return np.sum(x**2)

    space = RealSpace([lb, ub]) * dim
    mean = trend.constant_trend(dim, beta=None)
    thetaL = 1e-10 * (ub - lb) * np.ones(dim)
    thetaU = 10 * (ub - lb) * np.ones(dim)
    theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
    model = GaussianProcess(
        mean=mean,
        corr="squared_exponential",
        theta0=theta0,
        thetaL=thetaL,
        thetaU=thetaU,
        nugget=0,
        noise_estim=False,
        optimizer="BFGS",
        wait_iter=3,
        random_start=dim,
        likelihood="concentrated",
        eval_budget=100 * dim,
    )
    opt = BO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        DoE_size=5,
        max_FEs=10,
        verbose=True,
        n_point=1,
        log_file="log",
    )
    opt.save("test")
    opt = BO.load("test")
    print(opt.run())
    os.remove("test")
    os.remove("log")

    opt = ParallelBO(
        search_space=space,
        obj_fun=fitness,
        model=model,
        DoE_size=5,
        max_FEs=10,
        verbose=True,
        n_point=3,
        log_file="log",
    )
    opt.save("test")
    opt = BO.load("test")
    print(opt.run())
    os.remove("test")
    os.remove("log")
def modeling(train, targets, to_optimize, **kwargs):
    """
    Train a model, with optional hyperparameter optimization by Bayesian
    Optimization. Currently only Random Forests are supported.
    TODO: Make the hyperparameter optimization and training-set handling more interactive

    :param train: train set (pandas DataFrame)
    :param targets: targets (labels) (np.array)
    :param to_optimize: whether to perform hyperparameter optimization (boolean)
    :param cv: CV fold count for the hyperparameter optimization
    :param to_drop: features to be dropped from learning, such as unit numbers,
        cycles, etc. (list of string names)
    :param DoE_size: initial design-of-experiments size for the BO
    :param max_FEs: maximum number of function evaluations of the BO
    :param features_list: a list of features to use
    :return: trained model and list of used features
    """
    start = time.time()
    features_list = kwargs.get('features_list', None)
    to_drop = kwargs.get('to_drop', None)
    cv = kwargs.get('cv', 10)
    DoE_size = kwargs.get('DoE_size', 200)
    max_FEs = kwargs.get('max_FEs', 20)

    train_set = train.copy()
    if to_drop:
        print(f'The following features will not be used in training: {to_drop}')
        train_set.drop(to_drop, axis=1, inplace=True)
    if features_list:
        print('Features selected by user')
        train_set = train_set[features_list]
        train_set = train_set.values
    else:
        print('Feature Selection (this will take a while...)')
        train_set, features_list = boruta_feature_selection(train_set, targets)
    with open('./features_list.pkl', 'wb') as f:
        pkl.dump(features_list, f)

    df_columns = ['acc', 'max_depth', 'n_estimators', 'bootstrap',
                  'max_features', 'min_samples_leaf', 'min_samples_split']
    df_eval = pd.DataFrame(columns=df_columns)

    # Hyperparameter optimization: objective function evaluated by the BO.
    # x is ordered as the search space below: [max_depth, n_estimators,
    # bootstrap, max_features, min_samples_leaf, min_samples_split]
    def obj_func(x):
        # logger.info('Started internal cross-validation')
        nonlocal df_eval
        performance_ = []
        skf = StratifiedKFold(n_splits=cv, random_state=np.random, shuffle=True)
        for train_set_index, test_index in tqdm(skf.split(train_set, targets), 'Optimizing HO'):
            X_train_set, X_test = train_set[train_set_index], train_set[test_index]
            y_train_set, y_test = targets[train_set_index], targets[test_index]
            rf_ = RandomForestClassifier(n_estimators=int(x[1]),
                                         max_depth=int(x[0]),
                                         # the search space encodes booleans as the
                                         # strings 'True'/'False', so convert here
                                         bootstrap=x[2] == 'True',
                                         max_features=x[3],
                                         min_samples_leaf=x[4],
                                         min_samples_split=x[5],
                                         n_jobs=-1)
            rf_.fit(X_train_set, y_train_set)
            predictions_ = rf_.predict(X_test)
            performance_.append(accuracy_score(y_test, predictions_))
        val = np.mean(performance_)
        df_eval_tmp = pd.DataFrame([[val, x[0], x[1], x[2], x[3], x[4], x[5]]],
                                   columns=df_columns)
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        df_eval = pd.concat([df_eval, df_eval_tmp], ignore_index=True)
        return val

    # Definition of the hyperparameter search space
    max_depth = OrdinalSpace([2, 100])
    n_estimators = OrdinalSpace([1, 1000])
    min_samples_leaf = OrdinalSpace([1, 10])
    min_samples_split = OrdinalSpace([2, 20])
    bootstrap = NominalSpace(['True', 'False'])
    max_features = NominalSpace(['auto', 'sqrt', 'log2'])
    search_space = (max_depth + n_estimators + bootstrap + max_features
                    + min_samples_leaf + min_samples_split)

    model = RandomForest(levels=search_space.levels)
    opt = BO(search_space=search_space, obj_fun=obj_func, model=model,
             max_FEs=max_FEs, DoE_size=DoE_size, n_point=1, n_job=1,
             minimize=False, verbose=False)

    if to_optimize:
        print(f'Hyperparameter optimization with {cv}-fold CV and {max_FEs} function evaluations')
        opt.run()
        best_params_ = df_eval[df_columns[1:]][df_eval['acc'] == df_eval['acc'].max()][:1].to_dict('records')
        # convert the string-encoded bootstrap level back to a bool
        best_params_[0]['bootstrap'] = best_params_[0]['bootstrap'] == 'True'

    # Training using the best parameters
    if to_optimize:
        rf = RandomForestClassifier(n_jobs=-1, **best_params_[0])
    else:
        rf = RandomForestClassifier(n_jobs=-1)
    rf.fit(train_set, targets)
    dump(rf, './rf_model.joblib')
    end = time.time()
    print(f'----Duration of training is {(end - start) / 60:.2f} minutes')
    return rf, features_list
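# A minimal usage sketch of `modeling`; the DataFrame, labels, and keyword
# values below are hypothetical and only illustrate the calling convention:
import numpy as np
import pandas as pd

train_df = pd.DataFrame(np.random.rand(100, 3), columns=['s1', 's2', 's3'])
labels = np.random.randint(0, 2, size=100)  # binary class labels

rf, used_features = modeling(
    train_df,
    labels,
    to_optimize=True,                    # run the BO hyperparameter search
    features_list=['s1', 's2', 's3'],    # skip Boruta feature selection
    cv=5,
    DoE_size=10,
    max_FEs=5,
)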
# Discrete (nominal) variables can be specified as follows:
# no lb, ub ... a list of categories instead
N = DiscreteSpace(["OK", "A", "B", "C", "D", "E", "F", "G"], var_name="nominal")

# The whole search space can then be constructed:
search_space = C + I + N

# Bayesian optimization also uses a surrogate model.
# For mixed variable types, a random forest is typically used.
model = RandomForest(levels=search_space.levels)

opt = BO(
    search_space=search_space,
    obj_fun=obj_fun,
    model=model,
    max_FEs=50,
    DoE_size=3,  # the initial DoE size
    eval_type="dict",
    acquisition_fun="MGFI",
    acquisition_par={"t": 2},
    n_job=1,     # number of processes
    n_point=1,   # number of candidate solutions proposed in each iteration
    verbose=True,  # turn this off if you prefer no output
)
xopt, fopt, stop_dict = opt.run()

print("xopt: {}".format(xopt))
print("fopt: {}".format(fopt))
print("stop criteria: {}".format(stop_dict))
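# The snippet above assumes `C`, `I`, and `obj_fun` were defined earlier; they
# are elided here. A plausible completion (the bounds, variable names, and the
# objective are illustrative assumptions, not the source's definitions):
from bayes_optim import BO, RealSpace, IntegerSpace, DiscreteSpace
from bayes_optim.Surrogate import RandomForest

C = RealSpace([-5, 5], var_name="c1") + RealSpace([-5, 5], var_name="c2")
I = IntegerSpace([5, 15], var_name="i")

def obj_fun(x):
    # dict-based evaluation, matching eval_type="dict" above
    penalty = 0 if x["nominal"] == "OK" else 2
    return x["c1"] ** 2 + x["c2"] ** 2 + abs(x["i"] - 10) + penalty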
# Assumes dim, lb, ub, and fitness are defined as in the examples above, e.g.:
dim = 5
lb, ub = -1, 5

def fitness(x):
    x = np.asarray(x)
    return np.sum(x**2)

space = RealSpace([lb, ub]) * dim
thetaL = 1e-10 * (ub - lb) * np.ones(dim)
thetaU = 10 * (ub - lb) * np.ones(dim)
theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
model = GaussianProcess(
    theta0=theta0,
    thetaL=thetaL,
    thetaU=thetaU,
    nugget=1e-3,
    noise_estim=True,
    optimizer="BFGS",
    wait_iter=3,
    random_start=dim,
    likelihood="concentrated",
    eval_budget=100 * dim,
)
opt = BO(
    search_space=space,
    obj_fun=fitness,
    model=model,
    DoE_size=5,
    max_FEs=20,
    verbose=True,
    n_point=1,
    acquisition_optimization={"optimizer": "OnePlusOne_Cholesky_CMA"},
)
print(opt.run())
# Fragment: the def line of the fitness function was missing; it is restored
# here, and dim, lb, ub are assumed to be defined as in the examples above.
def fitness(x):
    x = np.asarray(x)
    return np.sum(x**2)

space = ContinuousSpace([lb, ub]) * dim
mean = trend.constant_trend(dim, beta=None)
thetaL = 1e-10 * (ub - lb) * np.ones(dim)
thetaU = 10 * (ub - lb) * np.ones(dim)
theta0 = np.random.rand(dim) * (thetaU - thetaL) + thetaL
model = GaussianProcess(
    theta0=theta0, thetaL=thetaL, thetaU=thetaU,
    nugget=0, noise_estim=False,
    optimizer='BFGS', wait_iter=3, random_start=dim,
    likelihood='concentrated', eval_budget=100 * dim,
)
opt = BO(
    search_space=space, obj_fun=fitness, model=model,
    DoE_size=5, max_FEs=50, verbose=True, n_point=1,
)
print(opt.run())