Example #1
def test_NominalSpace():
    S = NominalSpace(
        [['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y']
    )
    assert all(
        set(v) == set(['OK', 'A', 'B', 'C', 'D', 'E']) for k, v in S.levels.items()
    )

    S = NominalSpace([['A'] * 3, 'B', ['x', 'y']])
    assert set(S.levels[2]) == set(['x', 'y'])

    S = NominalSpace(['x', 'y', 'z'])
    assert set(S.levels[0]) == set(['x', 'y', 'z'])
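    # The duplicated 'A' above is deliberate: levels keep only the unique
    # values per variable. A quick check, using the sampling API shown in
    # examples #3 and #4 (a sketch):
    S = NominalSpace([['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y'])
    assert all(len(set(v)) == 6 for v in S.levels.values())  # duplicates collapsed
    print(S.sampling(3))  # three random points, e.g. [['B', 'x'], ['OK', 'y'], ...]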
Example #2
def test_mix_space():
    dim_r = 2  # dimension of the real values

    def obj_fun(x):
        x_r = np.array([x['continuous_%d' % i] for i in range(dim_r)])
        x_i = x['ordinal']
        x_d = x['nominal']
        _ = 0 if x_d == 'OK' else 1
        return np.sum(x_r**2) + abs(x_i - 10) / 123. + _ * 2

    search_space = ContinuousSpace([-5, 5], var_name='continuous') * dim_r + \
        OrdinalSpace([5, 15], var_name='ordinal') + \
        NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')

    model = RandomForest(levels=search_space.levels)

    opt = ParallelBO(
        search_space=search_space,
        obj_fun=obj_fun,
        model=model,
        max_FEs=6,
        DoE_size=3,  # the initial DoE size
        eval_type='dict',
        acquisition_fun='MGFI',
        acquisition_par={'t': 2},
        n_job=3,  # number of processes
        n_point=3,  # number of candidate solutions proposed in each iteration
        verbose=True  # turn this off if you prefer no output
    )
    xopt, fopt, stop_dict = opt.run()

    print('xopt: {}'.format(xopt))
    print('fopt: {}'.format(fopt))
    print('stop criteria: {}'.format(stop_dict))
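    # With eval_type='list' (see example #5) the candidate arrives as a flat
    # list ordered like the sum above: continuous_0, continuous_1, ordinal,
    # nominal. A list-based equivalent of obj_fun (a sketch, not from the source):
    def obj_fun_list(x):
        x_r = np.array(x[:dim_r])
        x_i, x_d = x[dim_r], x[dim_r + 1]
        return np.sum(x_r ** 2) + abs(x_i - 10) / 123. + (0 if x_d == 'OK' else 1) * 2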
Example #3
def test_sampling():
    C = ContinuousSpace([-5, 5]) * 3 
    I = OrdinalSpace([[-100, 100], [-5, 5]], 'heihei')
    N = NominalSpace([['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y'])

    S = N + I + C
    S.sampling(5, method='LHS')
    S.sampling(5, method='uniform')
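    # sampling returns the drawn points; with S = N + I + C each point has
    # 7 entries in that order (2 nominal, 2 ordinal, 3 continuous). A quick
    # inspection, assuming the list-of-points return used in example #5:
    X = S.sampling(5, method='LHS')
    assert len(X) == 5
    print(X[0])  # e.g. ['B', 'x', -42, 3, 0.17, -4.9, 2.3]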
Example #4
def test_from_dict():
    a = SearchSpace.from_dict(
        {
            "activation": {
                "type": "c",
                "range": [
                    "elu", "selu", "softplus", "softsign", "relu", "tanh",
                    "sigmoid", "hard_sigmoid", "linear"
                ],
                "N": 3
            }
        }
    )
              
    print(a.var_name)
    print(a.sampling(1))

    a = NominalSpace(['aaa'], name='test')
    print(a.sampling(3))
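    # Assuming "type": "c" declares a categorical variable and "N": 3 requests
    # three copies, the dict above should match the explicit constructor
    # (a sketch, not confirmed by the source):
    b = NominalSpace(
        ['elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh',
         'sigmoid', 'hard_sigmoid', 'linear'],
        var_name='activation'
    ) * 3
    print(b.var_name)  # expected: activation_0, activation_1, activation_2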
Example #5
def test_warm_data_with_RF():
    space = ContinuousSpace([-10, 10]) * 2 + \
        OrdinalSpace([5, 15]) + \
        NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'])
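
    # obj_fun is defined elsewhere in the original test file; a minimal
    # list-based stand-in mirroring the objective of example #2 (an assumption):
    def obj_fun(x):
        return (x[0] ** 2 + x[1] ** 2 + abs(x[2] - 10) / 123.
                + (0 if x[3] == 'OK' else 1) * 2)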

    X = space.sampling(10)
    y = [obj_fun(x) for x in X]

    model = RandomForest(levels=space.levels)
    opt = BO(search_space=space,
             obj_fun=obj_fun,
             model=model,
             minimize=True,
             eval_type='list',
             max_FEs=10,
             verbose=True,
             acquisition_fun='EI',
             warm_data=(X, y))
    opt.run()
    assert opt.data.shape[0] == 20
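    # Bookkeeping: the 10 warm-start points plus max_FEs=10 BO evaluations give
    # 20 rows in opt.data (assuming one row per evaluated point, one column per
    # search variable):
    print(opt.data.shape)  # (20, 4)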
Example #6
def test_ProductSpace():
    C = ContinuousSpace([-5, 5], precision=1) * 3  # product of the same space
    I = OrdinalSpace([[-100, 100], [-5, 5]], 'heihei')
    N = NominalSpace([['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y'])

    space = C + C + C
    print(space.sampling(2))

    # Cartesian product of heterogeneous spaces
    space = C + I + N 
    print(space.sampling(10))
    print(space.bounds)
    print(space.var_name)
    print(space.var_type)

    print((C * 2).var_name)
    print((N * 3).sampling(2))

    C = ContinuousSpace([[0, 1]] * 2, var_name='weight')
    print(C.var_name)
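
    # precision=1 above presumably rounds sampled continuous values to one
    # decimal place; a quick check under that assumption (a sketch):
    print((ContinuousSpace([-5, 5], precision=1) * 3).sampling(1))  # e.g. [[-3.2, 0.7, 4.4]]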
Example #7
def test_MIES():
    def obj_fun(X):
        v = []
        for x in X:
            x_r = np.array([x['real_%d' % i] for i in range(2)])
            x_i = np.array([x['int_%d' % i] for i in range(2)])
            x_d = np.array([x['cat_%d' % i] for i in range(2)])

            v.append(
                np.sum(x_r ** 2) + \
                np.sum(np.abs(x_i - 10) / 123.) + \
                np.sum(x_d == 'OK')
            )
        return v

    search_space = ContinuousSpace([-5, 5], var_name='real') * 2 + \
        OrdinalSpace([5, 15], var_name='int') * 2 + \
        NominalSpace(
            ['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'],
            var_name='cat'
        ) * 2

    opt = MIES(
        search_space=search_space,
        obj_func=obj_fun,
        max_eval=100,
        eval_type='dict',
        verbose=True  # turn this off if you prefer no output
    )
    xopt, fopt, stop_dict = opt.optimize()

    print('xopt: {}'.format(xopt))
    print('fopt: {}'.format(fopt))
    print('stop criteria: {}'.format(stop_dict))


# def test_multi_acquisition():
# dim_r = 2  # dimension of the real values
# def obj_fun(x):
#     x_r = np.array([x['continuous_%d'%i] for i in range(dim_r)])
#     x_i = x['ordinal']
#     x_d = x['nominal']
#     _ = 0 if x_d == 'OK' else 1
#     return np.sum(x_r ** 2) + abs(x_i - 10) / 123. + _ * 2

# search_space = ContinuousSpace([-5, 5], var_name='continuous') * dim_r + \
#     OrdinalSpace([5, 15], var_name='ordinal') + \
#     NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')

# model = RandomForest(levels=search_space.levels)

# opt = MultiAcquisitionBO(
#     search_space=search_space,
#     obj_fun=obj_fun,
#     model=model,
#     max_FEs=8,
#     DoE_size=4,    # the initial DoE size
#     eval_type='dict',
#     n_job=4,       # number of processes
#     n_point=4,     # number of candidate solutions proposed in each iteration
#     verbose=True   # turn this off if you prefer no output
# )

# xopt, fopt, stop_dict = opt.run()
# print('xopt: {}'.format(xopt))
# print('fopt: {}'.format(fopt))
# print('stop criteria: {}'.format(stop_dict))
Example #8
dim_r = 2  # dimension of the real-valued variables (as in example #2)

# for 2 variables, the naming scheme is continuous_0, continuous_1
C = ContinuousSpace([-5, 5], var_name='continuous') * dim_r

# Equivalently, you can also use
# C = ContinuousSpace([[-5, 5]] * dim_r)
# The general usage is:
# ContinuousSpace([[lb_1, ub_1], [lb_2, ub_2], ..., [lb_n, ub_n]])

# Integer (ordinal) variables can be specified as follows:
# the domain is given in the same way as for continuous variables;
# var_name is optional
I = OrdinalSpace([5, 15], var_name='ordinal')

# Discrete (nominal) variables can be specified as follows:
# No lb, ub... a list of categories instead
N = NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')

# The whole search space can be constructed:
search_space = C + I + N
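
# The objective receives a dict keyed by variable names (eval_type='dict' below);
# a minimal objective mirroring example #2:
def obj_fun(x):
    x_r = np.array([x['continuous_%d' % i] for i in range(dim_r)])
    x_i = x['ordinal']
    x_d = x['nominal']
    penalty = 0 if x_d == 'OK' else 1
    return np.sum(x_r ** 2) + abs(x_i - 10) / 123. + penalty * 2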

# Bayesian optimization also uses a surrogate model;
# for mixed variable types, a random forest is typically used
model = RandomForest(levels=search_space.levels)

opt = ParallelBO(
    search_space=search_space,
    obj_fun=obj_fun,
    model=model,
    max_FEs=100,
    DoE_size=3,   # the initial DoE size
    eval_type='dict',
    acquisition_fun='MGFI',
    acquisition_par={'t': 2},
    n_job=3,      # number of processes
    n_point=3,    # number of candidate solutions proposed in each iteration
    verbose=True  # turn this off if you prefer no output
)
xopt, fopt, stop_dict = opt.run()

print('xopt: {}'.format(xopt))
print('fopt: {}'.format(fopt))
print('stop criteria: {}'.format(stop_dict))
Example #9
def modeling(train, targets, to_optimize, **kwargs):
    """
    Training and performing hyperparmeter optimization
    by Bayesian Optimization. Currently only supporting
    Random Forests.
    TODO: Make the HO and train_seting more interactive

    :param to_optimize: perform or not HO (boolean)
    :param train: train set (pandas)
    :param targets: targets (labels) (np.arrays)
    :cv: CV count for hyperparameter optimization
    :to_drop: Features to be dropped from learning such as unit numbers,
     cycles, etc (list of string names)
    :DoE_size: Initial design of experiment for the BO HO.
    :max_FEs: maximum number of function evaluations of the BO HO
    :features_list= a list of features to use, cv=
    :return: trained model and list of used features
    """

    start = time.time()
    features_list = kwargs.get('features_list', None)
    to_drop = kwargs.get('to_drop', None)
    cv = kwargs.get('cv', 10)
    DoE_size = kwargs.get('DoE_size', 200)
    max_FEs = kwargs.get('max_FEs', 20)

    train_set = train.copy()
    if to_drop:
        print(f'The following features will not be used in training: {to_drop}')
        train_set.drop(to_drop, axis=1, inplace=True)

    if features_list:
        print('Features selected by user')
        train_set = train_set[features_list]
        train_set = train_set.values

    else:
        print('Feature Selection (this will take a while...)')
        train_set, features_list = boruta_feature_selection(train_set, targets)

        with open('./features_list.pkl', 'wb') as f:
            pkl.dump(features_list, f)

    df_columns = ['acc', 'max_depth', 'n_estimators', 'bootstrap', 'max_features', 'min_samples_leaf',
                  'min_samples_split']

    df_eval = pd.DataFrame(columns=df_columns)

    # Hyperparameter optimization
    # objective function
    def obj_func(x):

        # logger.info('Started internal cross-validation')
        nonlocal df_eval

        performance_ = []

        skf = StratifiedKFold(n_splits=cv, random_state=np.random, shuffle=True)
        for train_set_index, test_index in tqdm(skf.split(train_set, targets), 'Optimizing HO'):
            X_train_set, X_test = train_set[train_set_index], train_set[test_index]
            y_train_set, y_test = targets[train_set_index], targets[test_index]

            rf_ = RandomForestClassifier(
                n_estimators=int(x[1]), max_depth=int(x[0]),
                bootstrap=(x[2] == 'True'),  # the nominal level comes back as a string
                max_features=x[3], min_samples_leaf=x[4],
                min_samples_split=x[5], n_jobs=-1
            )

            rf_.fit(X_train_set, y_train_set)

            predictions_ = rf_.predict(X_test)

            performance_.append(accuracy_score(y_test, predictions_))

        val = np.mean(performance_)

        df_eval_tmp = pd.DataFrame([[val, x[0], x[1], x[2], x[3], x[4], x[5]]],
                                   columns=df_columns)
        df_eval = pd.concat([df_eval, df_eval_tmp], ignore_index=True)  # .append was removed in pandas 2.0
        return val

    # definition of hyperparameter search space:
    max_depth = OrdinalSpace([2, 100])
    n_estimators = OrdinalSpace([1, 1000])
    min_samples_leaf = OrdinalSpace([1, 10])
    min_samples_split = OrdinalSpace([2, 20])
    bootstrap = NominalSpace(['True', 'False'])
    max_features = NominalSpace(['auto', 'sqrt', 'log2'])

    search_space = max_depth + n_estimators + bootstrap + max_features + min_samples_leaf + min_samples_split
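    # BO passes each candidate as a list ordered like the sum above:
    # x[0]=max_depth, x[1]=n_estimators, x[2]=bootstrap, x[3]=max_features,
    # x[4]=min_samples_leaf, x[5]=min_samples_split (matching obj_func).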
    model = RandomForest(levels=search_space.levels)

    opt = BO(search_space=search_space, obj_fun=obj_func, model=model, max_FEs=max_FEs,
             DoE_size=DoE_size,
             n_point=1,
             n_job=1,
             minimize=False,
             verbose=False)

    if to_optimize:
        print(f'Hyperparameter optimization with {cv}-folds and {max_FEs} function evaluations')
        opt.run()
    # Training using the best parameters
    if to_optimize:
        best_params_ = df_eval[df_columns[1:]][df_eval['acc'] == df_eval['acc'].max()][:1].to_dict('records')
        params_ = best_params_[0]
        params_['bootstrap'] = params_['bootstrap'] == 'True'  # stored as a string in df_eval
        rf = RandomForestClassifier(n_jobs=-1, **params_)
    else:
        rf = RandomForestClassifier(n_jobs=-1)
    rf.fit(train_set, targets)

    dump(rf, './rf_model.joblib')
    end = time.time()

    print(f'----Duration of training is {(end - start) / 60:.1f} minutes')

    return rf, features_list
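

# A hedged usage sketch: train_df (pandas DataFrame of features) and labels
# (np.ndarray) are hypothetical placeholders; the to_drop names follow the
# docstring's examples:
rf_model, used_features = modeling(
    train_df, labels,
    to_optimize=True,
    cv=5,                              # folds for the internal cross-validation
    DoE_size=50,                       # initial design size for the BO
    max_FEs=20,                        # budget of BO function evaluations
    to_drop=['unit_number', 'cycle'],  # identifier columns excluded from learning
)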