Example no. 1
def model(X_train, Y_train, X_test, Y_test):

    model = models.Sequential()
    model.add(layers.Dense(512, input_shape=(784, )))
    model.add(layers.Activation('relu'))
    model.add(layers.Dropout({{hyperopt.uniform(0, 1)}}))
    model.add(layers.Dense({{hyperopt.choice([256, 512, 1024])}}))
    model.add(layers.Activation({{hyperopt.choice(['relu', 'sigmoid'])}}))
    model.add(layers.Dropout({{hyperopt.uniform(0, 1)}}))

    # If we choose 'four', add an additional fourth layer
    if {{hyperopt.choice(['three', 'four'])}} == 'four':
        model.add(layers.Dense(100))
        model.add({{
            hyperopt.choice([layers.Dropout(0.5),
                             layers.Activation('linear')])
        }})
        model.add(layers.Activation('relu'))

    model.add(layers.Dense(10))
    model.add(layers.Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer={{hyperopt.choice(['rmsprop', 'adam', 'sgd'])}},
                  metrics=['accuracy'])

    model.fit(X_train,
              Y_train,
              batch_size={{hyperopt.choice([64, 128])}},
              epochs=1,
              verbose=2,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, verbose=0)
    print('Test accuracy:', acc)
    return {'loss': -acc, 'status': hyperopt.STATUS_OK, 'model': model}
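The double-brace expressions above are not plain Python: they are template placeholders that a wrapper such as hyperas expands into hyperopt search-space nodes before the function runs. Below is a minimal driver sketch, assuming a hyperas-like optim.minimize entry point and a data() loader for MNIST; both are assumptions, since neither appears in the original snippet.

from hyperopt import Trials, tpe
from hyperas import optim
from keras.datasets import mnist
from keras.utils import to_categorical


def data():
    # Hypothetical loader returning the four arrays the model function expects.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train = X_train.reshape(60000, 784).astype('float32') / 255
    X_test = X_test.reshape(10000, 784).astype('float32') / 255
    return X_train, to_categorical(y_train, 10), X_test, to_categorical(y_test, 10)


best_run, best_model = optim.minimize(model=model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=10,
                                      trials=Trials())
print(best_run)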
Example no. 2
def train(df, experiment_name, run_name):
    mlflow.set_experiment(experiment_name)

    data = df.toPandas()
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["quality"], axis=1), data[["quality"]].values.ravel(), random_state=42)

    search_space = {
        'n_estimators': hp.uniform('n_estimators', 10, 100),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 1, 20),
        'max_depth': hp.uniform('max_depth', 2, 10),
    }

    spark_trials = SparkTrials(parallelism=4)

    with mlflow.start_run(run_name=run_name):
        fmin(
            fn=evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test),
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
            trials=spark_trials,
        )
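evaluate_hyperparams_wrapper is referenced but not defined in this snippet. A minimal sketch of what such a wrapper could look like, assuming a scikit-learn RandomForestRegressor on the wine-quality data and mean squared error as the loss (the model choice and metric are assumptions):

from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test):
    def evaluate(params):
        # hp.uniform yields floats; tree counts and depths must be integers.
        model = RandomForestRegressor(
            n_estimators=int(params["n_estimators"]),
            min_samples_leaf=int(params["min_samples_leaf"]),
            max_depth=int(params["max_depth"]),
            random_state=42,
        )
        model.fit(X_train, y_train)
        mse = mean_squared_error(y_test, model.predict(X_test))
        # fmin minimises the returned loss.
        return {"loss": mse, "status": STATUS_OK}

    return evaluate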
Example no. 3
    early_stopping_rounds = 100,
    verbose = 200)

    # prediction
    pred = model.predict_proba(X_val)
    eval_df = prepare_eval_df()

    # scoring model
    score = macro_lrap(eval_df)

    return {'loss': -score, 'status': STATUS_OK}

# initial hyperparameter space
space = dict()
space['n_estimators'] = hp.quniform('n_estimators', 100, 2000, 1)
space['max_depth'] = hp.quniform('max_depth', 2, 20, 1)
space['learning_rate'] = hp.loguniform('learning_rate', -5, 0)

# trials object for logging information
trials = Trials()

# maximum number of evaluation rounds
max_evals = 50

# run the optimisation
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials
)
Example no. 4
    print("TP = {}".format(TP))
    print("FP = {}".format(FP))
    print("FN = {}".format(FN))

    f1 = 2. * TP / (2. * TP + FP + FN)
    print("F1 : ", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK}


space = {
    'n_estimators': hp.choice('n_estimators', np.arange(200,
                                                        501,
                                                        25,
                                                        dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(15, 20, dtype=int)),
    'max_features': hp.choice('max_features', np.arange(15, 30, dtype=int)),
    'mss': hp.choice('mss', np.arange(2, 40, 1, dtype=int)),
    'cw': hp.uniform('cw', 1, 5),
    'msl': hp.choice('msl', np.arange(1, 11, dtype=int))
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

best_pars = space_eval(space, best)
pprint(best_pars)
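Note that for hp.choice parameters, fmin returns the index of the selected option rather than the value itself, so space_eval(space, best) is what resolves best into concrete parameter values (for example an actual n_estimators count instead of a position in the np.arange grid) before fitting a final model.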
Example no. 5
mlflow.set_experiment(experiment_name)

raw_data = spark.read.format("csv").option("header", "true").option(
    "sep", ";").load(input_data_path)
features = engineer_features(raw_data)
data = rename_columns(features).toPandas()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["quality"], axis=1),
    data[["quality"]].values.ravel(),
    random_state=42)

search_space = {
    # hp.uniform samples floats; cast to int inside the objective where integers are required.
    "n_estimators": hp.uniform("n_estimators", 10, 500),
    "min_samples_leaf": hp.uniform("min_samples_leaf", 1, 20),
    "max_depth": hp.uniform("max_depth", 2, 10),
}

spark_trials = SparkTrials(parallelism=4)

with mlflow.start_run(run_name=parent_run_name):
    fmin(
        fn=evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test),
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=spark_trials,
    )
Example no. 6
from hyperopt import hp, fmin, rand, tpe, space_eval


# Define the objective function
def q(args):
    x, y = args
    return x**2 + y**2


# Define the configuration space
space = [hp.uniform('x', -1, 1), hp.normal('y', -1, 1)]
# Choose a search algorithm
best = fmin(q, space, algo=tpe.suggest, max_evals=100)
print(best)
print(space_eval(space, best))

import pickle
import time
from hyperopt import STATUS_OK


def objective(x):
    return {'loss': x**2, 'status': STATUS_OK}


best = fmin(objective,
            space=hp.uniform('x', -10, 10),
            algo=tpe.suggest,
            max_evals=100)
print(best)
Example no. 7
            plt.figure()
            plt.plot(test_AUC_list, label='test_AUC')
            plt.show()

    return -best_AUC


# =========use library "hyperopt" to fine-tune the hyperparameters==============

from functools import partial
from hyperopt import fmin, tpe, hp

batch_list = [32, 64, 128]

for dist in [5.]:
    space = {
        "lr_rate": hp.uniform("lr_rate", 0.0005, 0.01),
        "dp_out": hp.uniform("dp_out", 0.5, 1),
        "bt_size": hp.choice("bt_size", batch_list),
        "distance": hp.choice("distance", [dist])
    }
    # algo = partial(tpe.suggest, n_startup_jobs=10)
    try:
        best = fmin(main, space, algo=tpe.suggest, max_evals=50)
        best["bt_size"] = batch_list[best["bt_size"]]
        best["distance"] = dist
        best_AUC = -main(best)
        with open('finetune.txt', 'a') as f:
            f.write(
                "At distance {}, the best AUC is {}, its lr_rate is {}, drop_out is {}, batch_size is {}\n\n"
                .format(dist, best_AUC, best["lr_rate"], best["dp_out"],
                        best["bt_size"]))
    except Exception as exc:
        # Report failures for this distance so the loop over distances can continue.
        print("Optimisation failed at distance {}: {}".format(dist, exc))
Example no. 8
    if booster == "gbtree":
        pred_test = model.predict(X_test)
    elif booster == "dart":
        pred_test = model.predict(X_test, ntree_limit=num_round)

    error = MSE(y_test, pred_test)
    r2 = -r2_score(y_train, model.predict(X_train))

    return float(error)


# DEFINING SEARCH SPACE
search_space = {
    'booster': hp.choice('booster', ['gbtree', 'dart']),
    'n_estimators': hp.quniform('n_estimators', 50, 3000, 1),
    'eta': hp.uniform('eta', 0, 1),
    'gamma': hp.uniform('gamma', 1, 500),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'min_child_weight': hp.uniform('min_child_weight', 0, 100),
    # scope.int keeps this as a searchable integer dimension rather than a value fixed up front
    'random_state': scope.int(hp.quniform('random_state', 4, 8, 1)),
    'subsample': hp.uniform('subsample', 0, 1),
    'alpha': hp.uniform('alpha', 1, 8),
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 1),
    'sample_type': hp.choice('sample_type', ['uniform', 'weighted']),
    'normalize_type': hp.choice('normalize_type', ['tree', 'forest']),
    'grow_policy': hp.choice('grow_policy', ['depthwise', 'lossguide']),
    'rate_drop': hp.uniform('rate_drop', 0, 1),
    'skip_drop': hp.uniform('skip_drop', 0, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0, 1),
    'colsample_bynode': hp.uniform('colsample_bynode', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 1, 8),
}
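The snippet above is truncated: neither its imports nor the call that launches the search are shown. A minimal sketch of the pieces such a space typically relies on, assuming the (truncated) XGBoost objective defined above is named objective; the surrounding names are assumptions.

from hyperopt import Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error as MSE, r2_score

trials = Trials()
best = fmin(fn=objective,      # the truncated objective defined above
            space=search_space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print(best)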
Example no. 9
import csv
from timeit import default_timer as timer

import lightgbm as lgb
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import GroupKFold


def lgb_tuning(lgb_cv, N_FOLDS=5, MAX_EVALS=100, output_file='bayes_test.csv',
               metric='auc', objection='binary', groups=None):
    def objective(hyperparameters, groups=groups):
        # Keep track of evals
        nonlocal ITERATION
        ITERATION += 1

        # Using early stopping to find number of trees trained
        if 'n_estimators' in hyperparameters:
            del hyperparameters['n_estimators']

        # Retrieve the subsample
        subsample = hyperparameters['boosting_type'].get('subsample', 1.0)

        # Extract the boosting type and subsample to top level keys
        hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
        hyperparameters['subsample'] = subsample

        # Make sure parameters that need to be integers are integers
        # (guarded, since some of these keys are commented out of the search space below)
        for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples', 'max_depth']:
            if parameter_name in hyperparameters:
                hyperparameters[parameter_name] = int(hyperparameters[parameter_name])
        hyperparameters['objective'] = objection
        #hyperparameters['verbose']=-1
        start = timer()
        
        # Perform n_folds cross validation
        if groups:
            groups = lgb_cv.get_group()
            folds = GroupKFold().split(lgb_cv.get_label(), groups=groups)
        else:
            folds = None

        if metric.lower() == 'map':
            hyperparameters['eval_at'] = 1

        cv_results = lgb.cv(hyperparameters, lgb_cv, num_boost_round=4000, nfold=N_FOLDS,
                            folds=folds, early_stopping_rounds=300, metrics=metric)

        run_time = timer() - start
        
        score_key = sorted(cv_results.keys())[0]
        # Extract the best score
        best_score = cv_results[score_key][-1]

        # Loss must be minimized
        if metric=='binary_error':
            loss=best_score
        else:
            loss = 1 - best_score

        # Boosting rounds that returned the highest cv score
        n_estimators = len(cv_results[score_key])

        # Add the number of estimators to the hyperparameters
        hyperparameters['n_estimators'] = n_estimators

        # Write to the csv file ('a' means append)
        of_connection = open(OUT_FILE, 'a')
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score])
        of_connection.close()

        # Dictionary with information for evaluation
        return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
                'train_time': run_time, 'status': STATUS_OK}
    
    # Define the search space
    space = {
        'boosting_type': hp.choice('boosting_type', [
            {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
            {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
            {'boosting_type': 'goss', 'subsample': 1.0}]),
        'num_leaves': hp.quniform('num_leaves', 20, 200, 4),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.5)),
        #'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
        'min_child_samples': hp.quniform('min_child_samples', 20, 300, 5),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.2),
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.2),
        #'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        'max_depth': hp.quniform('max_depth', 4, 8, 1),
    }
       

    # Create the algorithm
    tpe_algorithm = tpe.suggest
    # Record results
    trials = Trials()
    
    
    # Create a file and open a connection
    OUT_FILE = output_file
    of_connection = open(OUT_FILE, 'w')
    writer = csv.writer(of_connection)


    # Write column names
    headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
    writer.writerow(headers)
    of_connection.close()
    #global  ITERATION

    ITERATION = 0
    # Run optimization
    best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = trials,
                max_evals = MAX_EVALS)
    return best
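A minimal usage sketch for lgb_tuning, assuming a pandas DataFrame read from a hypothetical train.csv with a binary target column named target; the file path and column names are assumptions.

import lightgbm as lgb
import pandas as pd

# Hypothetical input data: column names and file path are assumptions.
train_df = pd.read_csv("train.csv")
lgb_cv = lgb.Dataset(train_df.drop(columns=["target"]), label=train_df["target"])

best = lgb_tuning(lgb_cv,
                  N_FOLDS=5,
                  MAX_EVALS=50,
                  output_file="bayes_test.csv",
                  metric="auc",
                  objection="binary")
print(best)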