def main():

    usage = "%prog"
    parser = OptionParser(usage=usage)
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_chars',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space
    reuse = options.reuse
    output_dirname = options.output_dirname

    if reuse:
        output_dirname += '_reuse'

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename_wo_ext(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=100,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
Example #2
def optimize_model_pytorch(device, args, train_GWAS, train_y, test_GWAS, test_y,
                           out_folder="", startupJobs=40, maxevals=200, noOut=False):
    global numTrials_pytorch
    numTrials_pytorch = 0

    trials = Trials()
    trial_wrapper = partial(trial_pytorch, device=device, args=args,
                            train_GWAS=train_GWAS, train_y=train_y,
                            test_GWAS=test_GWAS, test_y=test_y)

    best_pars = fmin(trial_wrapper, parameter_space_pytorch(),
                     algo=partial(tpe.suggest, n_startup_jobs=startupJobs),
                     max_evals=maxevals, trials=trials)

    # Print the selected 'best' hyperparameters.
    if not noOut:
        print('\nBest hyperparameter settings: ', space_eval(parameter_space_pytorch(), best_pars), '\n')

    regression = True

    # loop over the hyperparameter names stored in the first trial's lookup keys
    for p in trials.trials[0]['misc']['idxs']:
        plot_optimization_pytorch(trials, p, regression, out_folder=out_folder)

    best_pars = space_eval(parameter_space_pytorch(), best_pars)  # map choice indices back to actual values in the parameter space
    
    # override the epoch count with the early-stopping epoch of the best trial
    lowestLossIndex = np.argmin(trials.losses())
    best_pars['earlyStopEpochs'] = trials.trial_attachments(trials.trials[lowestLossIndex])['highestAcc_epoch']
    best_pars['earlyStopEpochs'] += 1  # epochs are 0-based
    best_pars['epochs'] = best_pars['earlyStopEpochs']
    if best_pars['epochs'] <= 0:
        best_pars['epochs'] = 1  # we don't want a network without any training, as that would break deep dreaming
    return best_pars
Example #3
def test_landing_screen():

    # define an objective function
    def objective(args):
        case, val = args
        if case == 'case 1':
            return val
        else:
            return val ** 2

    # define a search space
    from hyperopt import hp
    space = hp.choice('a',
        [
            ('case 1', 1 + hp.lognormal('c1', 0, 1)),
            ('case 2', hp.uniform('c2', -10, 10))
        ])

    # minimize the objective over the space
    import hyperopt
    best = hyperopt.fmin(objective, space,
        algo=hyperopt.tpe.suggest,
        max_evals=100)

    print(best)
    # -> {'a': 1, 'c2': 0.01420615366247227}

    print(hyperopt.space_eval(space, best))
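    # -> ('case 2', 0.01420615366247227)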
Example #4
File: run.py  Project: benbo/botc
def main():

    usage = "%prog text.json labels.csv feature_dir output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_iter', default=4,
                      help='Maximum iterations of Bayesian optimization; default=%default')

    (options, args) = parser.parse_args()
    max_iter = int(options.max_iter)

    global data_filename, label_filename, feature_dir, output_dir, log_filename

    data_filename = args[0]
    label_filename = args[1]
    feature_dir = args[2]
    output_dir = args[3]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    log_filename = os.path.join(output_dir, 'log.txt')

    with open(log_filename, 'w') as logfile:
        logfile.write(','.join([data_filename, label_filename, feature_dir, output_dir]))

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
Example #5
def main(args):
    print ("Made it to the start of main!")
    print ("the time at the start: " + str(time.time()))
    set_globals(args)
    trials = Trials()
    if args["run_bayesopt"]:
        space = space_manager.get_space(num_models, model_types, search_space)
        # DEBUGGING
        profile = cProfile.Profile()
        try:
            profile.enable()
            best = fmin(call_experiment, space=space, algo=tpe.suggest, max_evals=max_iter, trials=trials)
            profile.disable()
        finally:
            profile = pstats.Stats(profile).sort_stats("cumulative")
            profile.print_stats()

        print(space_eval(space, best))
        printing_best(trials)
    # loading models from file
    else:
        with open(model_path) as f:
            for i in range(line_num - 1):
                f.readline()

            space = eval(f.readline())
            best = call_experiment(space)
Example #6
def test_space_eval():
    space = hp.choice('a',
                      [
                          ('case 1', 1 + hp.lognormal('c1', 0, 1)),
                          ('case 2', hp.uniform('c2', -10, 10))
                      ])
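    # 'a' holds the index of the chosen hp.choice branch; the branch's own
    # labels ('c1'/'c2') hold the sampled leaf values, and space_eval maps the
    # combination back to a concrete ('case ...', value) tuple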

    assert space_eval(space, {'a': 0, 'c1': 1.0}) == ('case 1', 2.0)
    assert space_eval(space, {'a': 1, 'c2': 3.5}) == ('case 2', 3.5)
def main():
    set_globals()
    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)
    
    print(space_eval(space, best))
    print("losses:", [-l for l in trials.losses()])
    print('the best loss: ', max([-l for l in trials.losses()]))
    print("number of trials: " + str(len(trials.trials)))
Example #8
    def run(self):
        start = time.time()
        trials = Trials()
        best = fmin(self._obj, self.model_param_space._build_space(), tpe.suggest, self.max_evals, trials)
        best_params = space_eval(self.model_param_space._build_space(), best)
        best_params = self.model_param_space._convert_int_param(best_params)
        trial_rmses = np.asarray(trials.losses(), dtype=float)
        best_ind = np.argmin(trial_rmses)
        best_rmse_mean = trial_rmses[best_ind]
        best_rmse_std = trials.trial_attachments(trials.trials[best_ind])["std"]
        self.logger.info("-" * 50)
        self.logger.info("Best RMSE")
        self.logger.info("      Mean: %.6f" % best_rmse_mean)
        self.logger.info("      std: %.6f" % best_rmse_std)
        self.logger.info("Best param")
        self.task._print_param_dict(best_params)
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info("      %d mins" % _min)
        else:
            self.logger.info("      %d secs" % _sec)
        self.logger.info("-" * 50)
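The "std" attachment read above implies the objective wrote one per trial; a minimal sketch of the writing side, assuming a dict-returning hyperopt objective (rmse_mean, rmse_std, and the CV helper are stand-in names, not from the original):

    def _obj(self, params):
        rmse_mean, rmse_std = self._cv_rmse(params)  # hypothetical CV helper
        return {
            'loss': rmse_mean,
            'status': STATUS_OK,
            # stored per trial; read back via trials.trial_attachments(trial)['std']
            'attachments': {'std': rmse_std},
        }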
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    N_splits = 9
    N_seeds = 4
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train2, y)
    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=100,
                    nfold=10,
                    seed=params['seed'])
        score = cv.iloc[len(cv) - 1, 0]  # .ix was removed from pandas; .iloc keeps the original lookup
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)
    space_xgb = dict(
            max_depth = hp.choice('max_depth', range(2, 8)),
            subsample = hp.quniform('subsample', 0.6, 1, 0.05),
            colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
            learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
            min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
            gamma = hp.quniform('gamma', 0, 10, 0.05),
            alpha = hp.quniform('alpha', 0.0, 1, 0.0001),

            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    trs = load_state(cname + '_trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)), space_eval(space_xgb, tr.argmin))
    for n in range(25):
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials=tr)
        save_state(cname + '_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)
    xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params)
Example #10
def tune(estimator_class, X_train, y_train, space, eval_fun, cv=3, n_jobs=1, algo=tpe.suggest, max_evals=100):
    if isinstance(cv, int):
        # KFold(n, n_folds=...) is the pre-0.18 scikit-learn signature;
        # modern releases use KFold(n_splits=cv)
        cv = KFold(len(X_train), n_folds=cv)

    def hp_eval(params):
        estimator = estimator_class(**params)
        return cross_val(estimator, X_train, y_train, cv, eval_fun, n_jobs)

    best = fmin(hp_eval, space, algo=algo, max_evals=max_evals)
    best_params = space_eval(space, best)
    return best_params
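A hypothetical call, assuming a scikit-learn style estimator class and an eval_fun returning a loss to minimize (the estimator, data, and loss names below are stand-ins, not from the original):

from hyperopt import hp
from sklearn.ensemble import RandomForestClassifier

space = {
    'n_estimators': hp.choice('n_estimators', [100, 200, 400]),
    'max_depth': hp.choice('max_depth', range(2, 12)),
}
best_params = tune(RandomForestClassifier, X_train, y_train, space,
                   eval_fun=my_loss, cv=5, max_evals=50)
model = RandomForestClassifier(**best_params).fit(X_train, y_train)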
Example #11
def eval_hyperopt_space(space, vals):
    """
    Evaluate a set of parameter values within the hyperopt space.
    Optionally unpacks the values, if they are wrapped in lists.
    :param space: dict
        the hyperopt space dictionary
    :param vals: dict
        the values from a hyperopt trial
    :return: evaluated space
    """
    unpacked_vals = unpack_hyperopt_vals(vals)
    return space_eval(space, unpacked_vals)
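unpack_hyperopt_vals is not shown in this snippet; judging from the vals-unwrapping pattern used elsewhere in this listing, it presumably unwraps the one-element lists hyperopt stores in trial['misc']['vals'], roughly:

def unpack_hyperopt_vals(vals):
    # hyperopt records each sampled value as a one-element list
    # (an empty list when the parameter was inactive in that trial)
    return {k: v[0] for k, v in vals.items() if len(v) > 0}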
Example #12
def main(args):
    print("Made it to the start of main!")
    set_globals(args)
    trials = Trials()
    if args['run_bayesopt']:
        space = space_manager.get_space(num_models, model_types, search_space)
        best = fmin(call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_iter,
                    trials=trials)
        print(space_eval(space, best))
        printing_best(trials)
    #loading models from file
    else:
        with open(model_path) as f:
            for i in range(line_num - 1):
                f.readline()

            space = eval(f.readline())
            best = call_experiment(space)
def list_best(mongo, exp_key=None, space=None):
    mongo_trials = MongoTrials(mongo, exp_key=exp_key)

    jobs = mongo_trials.trials
    jobs_ok = [(d['result']['loss'], d) for d in jobs
               if d['state'] == 2 and d['result']['status'] == 'ok']  # state 2 == JOB_STATE_DONE
    jobs_ok.sort()

    for loss, job in reversed(jobs_ok):
        print(loss, job['owner'], job['result'])
        spec = spec_from_misc(job['misc'])
        print("spec:  {}".format(spec))
        if space is not None:
            print("space: {}".format(space_eval(space, spec)))
    print("total: {}/{}".format(len(jobs_ok), len(jobs)))
    return mongo_trials.argmin
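A hypothetical invocation, assuming a running hyperopt MongoDB backend (the connection string and experiment key are stand-ins):

best = list_best('mongo://localhost:27017/experiments/jobs',
                 exp_key='my_exp', space=space)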
Example #14
    def run(self, max_evals=100):
        trials = Trials()
        best = fmin(self.call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials)
        #print(best)
        args = space_eval(space, best)
        #print("losses:", [-l for l in trials.losses()])
        #print(max([-l for l in trials.losses()]))

        #TODO
        #return model and featurizer. Don't forget to set ascii option.
        featurizer = Featurizer(args)
        if args['text']['text'] == 'ascii':
            featurizer.set_ascii(True)
        return self.get_model(args), featurizer
Example #15
            '*****************************************************************************************'
        )
        for k in range(n_folds):
            print('Fold ' + str(k + 1))
            space['verbose'] = True
            root_mean_squared_error_k, mape_k, best_params_k = evaluate_on_test_set(
                space, results[k], test[k], objective, k, n_folds)
            root_mean_squared_error.append(root_mean_squared_error_k)
            mape.append(mape_k)
            best.append(best_params_k)

        ################################################################################################################
        # Save gait specific metrics
        df_list = []
        for k in range(n_folds):
            params = space_eval(space, results[k])
            params['sample_ids'] = test[k]
            params['metric'] = 'get_res'
            params['verbose'] = False
            df = objective(params)
            df_list.append(df)
        df_for_task_j = pd.concat(df_list)
        df_gait_measures_by_task.append(df_for_task_j)

        ################################################################################################################
        # Save results
        save_results = dict()
        save_results['best'] = best
        save_results['rmse'] = root_mean_squared_error
        save_results['mape'] = mape
        save_results['train'] = train
    'max_depth': hp.choice("max_depth", range(1, 10, 1)),
    'min_child_weight': hp.choice('min_child_weight', range(150, 171)),
    'subsample': hp.uniform('subsample', 0.7, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1),
    'gamma': hp.uniform('gamma', 0.1, 0.5),
    'reg_alpha': hp.uniform('reg_alpha', 5e-5, 5e-4)
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=500,
            trials=trials)

best_params = space_eval(space, best)
print(best_params)
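# Note: fmin returns hp.choice parameters as option indices; space_eval above is
# what maps them back to the concrete values XGBClassifier expects.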
# Test on the test set
NUM_TRIALS = int(np.ceil(200000 / train.shape[0]))
print('Test {} times on the test set'.format(NUM_TRIALS))
accuracy_array = []
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(**best_params,
                        learning_rate=0.01,
                        objective='binary:logistic',
                        n_jobs=-1,
                        scale_pos_weight=1,
                        seed=i)
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions (assumes sklearn.metrics.accuracy_score is imported)
    accuracy_array.append(accuracy_score(y_test, y_pred))
Example #17
def minimize(model,
             data,
             algo,
             max_evals,
             trials,
             functions=None,
             rseed=1337,
             notebook_name=None,
             verbose=True,
             eval_space=False,
             return_space=False):
    """
    Minimize a keras model for given data and implicit hyperparameters.

    Parameters
    ----------
    model: A function defining a keras model with hyperas templates, which returns a
        valid hyperopt results dictionary, e.g.
        return {'loss': -acc, 'status': STATUS_OK}
    data: A parameter-less function that defines and returns all data needed in the above
        model definition.
    functions: Optional list of helper functions referenced inside the model function
    algo: A hyperopt algorithm, like tpe.suggest or rand.suggest
    max_evals: Maximum number of optimization runs
    trials: A hyperopt trials object, used to store intermediate results for all
        optimization runs
    rseed: Integer random seed for experiments
    notebook_name: If running from an ipython notebook, provide filename (not path)
    verbose: Print verbose output
    eval_space: Evaluate the best run in the search space such that 'choice's contain actually meaningful values instead
                of mere indices
    return_space: Return the hyperopt search space object (e.g. for further processing) as last return value

    Returns
    -------
    If `return_space` is False: a pair consisting of the results dictionary of the best run and the
    corresponding Keras model.
    If `return_space` is True: the pair of best result and corresponding Keras model, plus the hyperopt
    search space.
    """
    best_run, space = base_minimizer(model=model,
                                     data=data,
                                     functions=functions,
                                     algo=algo,
                                     max_evals=max_evals,
                                     trials=trials,
                                     rseed=rseed,
                                     full_model_string=None,
                                     notebook_name=notebook_name,
                                     verbose=verbose)

    best_model = None
    for trial in trials:
        vals = trial.get('misc').get('vals')
        for key in vals.keys():
            vals[key] = vals[key][0]
        if trial.get('misc').get('vals') == best_run and 'model' in trial.get('result').keys():
            best_model = trial.get('result').get('model')

    if eval_space is True:
        # evaluate the search space
        best_run = space_eval(space, best_run)

    if return_space is True:
        # return the space as well
        return best_run, best_model, space
    else:
        # the default case for backwards compatibility with expanded return arguments
        return best_run, best_model
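For reference, the calling pattern mirrors the commented-out hyperas snippet later in this listing; a minimal sketch, assuming create_model and load_data follow the hyperas template convention:

from hyperopt import Trials, tpe

trials = Trials()
best_run, best_model = minimize(model=create_model,
                                data=load_data,
                                algo=tpe.suggest,
                                max_evals=5,
                                trials=trials)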
    return -np.mean(
        cross_val_score(model, words, train_y, cv=3, scoring="f1_macro"))


# Define a search space - Logistic regression so lets search for a value of C and which penalty to use.
# For full list of configuration options see http://hyperopt.github.io/hyperopt/getting-started/search_spaces/
space = {
    "C": hp.uniform("C", 0, 10),
    "penalty": hp.choice("penalty", ["l2", "none"]),
    "word_rep": hp.choice("word_rep", ["tfidf", "tf", "bow"]),
}

# minimize the objective over the space
best = fmin(objective, space, algo=tpe.suggest, max_evals=5000)
print(hyperopt.space_eval(space, best))

# %%
# Create model with best hyperparameters seen above. Need to manually select test representation

args = hyperopt.space_eval(space, best)
train_words = word_reps[args.pop("word_rep")]
test_words = test_tfidf

model = model_type(**args)

# Fit to training data
model.fit(train_words, train_y)

# Print metrics
print(
Example #19
    def fit(self, features, labels):
        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            labels,
                                                            test_size=0.2,
                                                            random_state=42)
        plt.figure(figsize=(18, 7 * len(self.classifiers)))
        i = 1
        for classifier in self.classifiers:
            instance = clone(self.models[classifier])
            self.results.append({
                'Name':
                classifier,
                'Params':
                instance.get_params(),
                'Score':
                cross_val_score(instance,
                                features,
                                labels,
                                cv=4,
                                scoring='roc_auc').mean()
            })
            if self.search:
                instance = clone(self.models[classifier])
                objective = partial(self.objective,
                                    model=instance,
                                    features=features,
                                    labels=labels)
                trials = Trials()
                best = fmin(fn=objective,
                            space=self.space[classifier],
                            trials=trials,
                            algo=tpe.suggest,
                            max_evals=self.max_evals)
                instance.set_params(**self.fixQuniform(
                    space_eval(self.space[classifier], best)))
                self.results.append({
                    'Name':
                    classifier + '_Tuned',
                    'Params':
                    instance.get_params(),
                    'Score':
                    -trials.best_trial['result']['loss'],
                })
            instance.fit(X_train, y_train)
            predict = [i[1] for i in instance.predict_proba(X_test)]
            fpr, tpr, _ = roc_curve(y_test, predict)
            precision, recall, _ = precision_recall_curve(y_test, predict)

            plt.subplot(len(self.classifiers), 2, i).plot(fpr, tpr)
            plt.plot([0, 1], [0, 1], ls='--')
            plt.title(classifier + ' Receiver Operating Characteristic Curve')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            i += 1

            plt.subplot(len(self.classifiers), 2, i).plot(recall, precision)
            plt.title(classifier + ' Precision Recall Curve')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            i += 1

            try:
                features_temp = pd.DataFrame({
                    classifier:
                    features.columns,
                    'importance':
                    instance.coef_[0]
                    if classifier == 'LR' else instance.feature_importances_
                })
                features_temp = features_temp.sort_values(
                    'importance',
                    ascending=False).head(min(20, len(features_temp)))
                features_temp = features_temp.reset_index(drop=True)
                self.feature_importance.append(features_temp)
            except Exception:
                print('no feature importances available')
        plt.savefig(self.imgbuffer, format='png')
def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)
    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        score = cv.iloc[len(cv) - 1, 0]  # .ix was removed from pandas; .iloc keeps the original lookup
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)
    space_xgb = dict(
            max_depth = hp.choice('max_depth', range(2, 8)),
            subsample = hp.quniform('subsample', 0.6, 1, 0.05),
            colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
            learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
            min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
            gamma = hp.quniform('gamma', 0.5, 10, 0.05),

            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    trs = state.load('xgb_trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)), space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 1

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2]  += clf.predict(dtest)
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, state.now())
            scores.append(score)
        z[cname2] /= N_splits

    cv = scores
    z['y'] = z[cname2]
    print('validation loss: ', cv, np.mean(cv), np.std(cv))

    return cv, None
def main():

    usage = "%prog <DRLD|MOLD|MIP|Primary|General|PK-...>"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='basic',
                      help='Model: (basic|GRU|LSTM); default=%default')
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_mod',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')
    parser.add_option('--mod', dest='mod', action="store_true", default=False,
                      help='Use modifications; default=%default')


    (options, args) = parser.parse_args()


    global output_dirname, output_filename, reuse, search_alpha, space, mod, dataset
    reuse = options.reuse
    output_dirname = options.output_dirname
    model = options.model
    mod = options.mod

    dataset = args[0]

    if model == 'basic':
        space['arch']['unit'] = 'basic'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 200, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -4, -1)
    elif model == 'GRU':
        space['arch']['unit'] = 'GRU'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 150, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    elif model == 'LSTM':
        space['arch']['unit'] = 'LSTM'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 100, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    else:
        sys.exit('Model not supported!')

    output_dirname += '_' + model

    if reuse:
        output_dirname += '_reuse'

    if mod:
        output_dirname += '_mod'

    output_dirname += '_' + dataset

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=60,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
                                             }
                                        ])

        score_key = 'score'

<<<<<<< HEAD

        for i, df in enumerate(self.dfs):
            df.drop(self.colTypes['Identity'], axis=1, inplace=True) #Dropping Identity cols
            self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(df.drop(self.y, axis=1), df[self.y], test_size=self.test_size)
            trials = Trials()
=======
        df=self.df.copy()
>>>>>>> 7db86ccf9079027446285fcbf67bdc0a735f8a71

        df.drop(self.colTypes['Identity'], axis=1, inplace=True) #Dropping Identity cols

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(df.drop(self.y, axis=1), df[self.y], test_size=self.test_size)
        trials = Trials()

        hyperparam = space_eval(self.space,
                                fmin(self.objective_func, self.space, trials=trials, algo=tpe.suggest, max_evals=100))
        score = -min(trials.losses())

        self.final_results['Hyperparameter'] = hyperparam
        self.final_results[score_key] = score
        self.final_results['Data'] = df

    def return_results(self):
        return self.final_results
Example #23
    if args.mode == "train":  # Just train with given parameters
        res = optimize(args.lr, args.clip)
    else:
        assert (args.mode == "hyper")
        space = [
            hp.loguniform('lr', -12, -2),
            hp.choice('clip_decision', [-1, hp.uniform('clip', 0.01, 1.0)])
        ]

        best = fmin(fn=lambda args: optimize(args[0], args[1]),
                    space=space,
                    algo=tpe.suggest,
                    max_evals=args.hyper_iter)

        print("Train model with best settings...")
        best = hyperopt.space_eval(space, best)
        res = optimize(best[0], best[1])
        print("Reached validation loss of " + str(res["loss"]) +
              " with parameters:")
        print(best)

    # Load the best saved model.
    if res["status"] != "fail":
        with open(res["model_name"], 'rb') as f:
            model = torch.load(f)

        # Run on train data
        train_loss, train_acc = evaluate(train_x, train_y)
        print('-' * 89)
        print('| End of training | train loss {:5.2f} | train acc {:8.2f}'.
              format(train_loss, train_acc))
Example #24
def run(train, y, test, v, z):
    np.random.seed(1)
    #cname = sys._getframe().f_code.co_name
    train = train.values
    test = test.values

    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    space_stack = hp.choice('stacking by', [
        dict(type='BayesianRidge'),
        dict(type='Lars'),
        dict(type='LinearRegression'),
        dict(type='Ridge'),
        dict(type='SGDRegressor', random_state=1),
        dict(
            type='XGBRegressor',
            max_depth=hp.choice('max_depth', range(2, 8)),
            subsample=hp.quniform('subsample', 0.6, 1, 0.05),
            colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1, 0.05),
            learning_rate=hp.quniform('learning_rate', 0.005, 0.03, 0.005),
            min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
            gamma=hp.quniform('gamma', 0, 10, 0.05),
            reg_alpha=hp.quniform('alpha', 0, 1, 0.0001),
        ),
    ])
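    # each hp.choice branch is a dict whose 'type' key get_lr() below uses to
    # dispatch to the matching scikit-learn / xgboost regressor; the remaining
    # keys become that regressor's keyword arguments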

    def get_lr(params):
        t = params['type']
        del params['type']

        if t == 'BayesianRidge':
            lr = linear_model.BayesianRidge(**params)
        elif t == 'Lars':
            lr = linear_model.Lars(**params)
        elif t == 'LinearRegression':
            lr = linear_model.LinearRegression(**params)
        elif t == 'Ridge':
            lr = linear_model.Ridge(**params)
        elif t == 'SGDRegressor':
            lr = linear_model.SGDRegressor(**params)
        elif t == 'XGBRegressor':
            lr = xgb.XGBRegressor(**params)

        return lr

    def step(params):
        print(params, end=' ')
        cv = model_selection.cross_val_score(get_lr(params),
                                             train,
                                             y,
                                             cv=10,
                                             scoring=metrics.make_scorer(
                                                 metrics.log_loss))
        score = np.mean(cv)
        print(score)
        return dict(loss=score, status=STATUS_OK)

    trs = state.load('trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        best = tr.argmin
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_stack, best))
    mt = max(50, len(tr.trials) + 1)
    while len(tr.trials) < min(50, mt):
        best = fmin(step,
                    space_stack,
                    algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1,
                    trials=tr)
        state.save('trials', (tr, space_stack))
    params = space_eval(space_stack, best)

    print('best params:', params)
    lr = get_lr(params)
    cv = model_selection.cross_val_score(lr,
                                         train,
                                         y,
                                         cv=10,
                                         scoring=metrics.make_scorer(
                                             metrics.log_loss))
    lr.fit(train, y)
    z['p'] = np.clip(lr.predict(test), 1e-5, 1 - 1e-5)
    z['y'] = z['p']
    v['p'] = model_selection.cross_val_predict(lr, train, y, cv=10)
    print('cv:', np.mean(cv), np.std(cv))
    return cv, None
Example #25
    The objective function gets curried and passed to the trials object with the data loaded.
    :param multiplier: float used to scale the objective function
    :return: objective function
    """
    def objective_curried(args):
        case, val = args
        log.debug("Arguments parsed {}".format(args))
        if case == 'case 1':
            return val
        else:
            return val**multiplier

    return objective_curried


object_curried = objective_currier(multiplier)
log.info("Starting Optimisation")
# Run optimisation
best = fmin(fn=object_curried,
            space=space,
            algo=tpe.suggest,
            max_evals=num_eval_steps,
            trials=trials,
            verbose=1)

log.info("Finished Optimisation")
# Get the values of the best space
best_space = space_eval(space, best)

log.info("Best value in optimisation is : " + str(best_space))
Example #26
    space_dnf = (hp.uniform('tau_dt', 0.0001, 1),
                 hp.uniform('h', -1, 0),
                 hp.uniform('gi', 0, 10),
                 hp.uniform('excitation_amplitude', 0.0001, 5),
                 hp.uniform('excitation_sigma', 0.0001, 1))
    best = fmin(evaluate,
                space_dnf,
                algo=tpe.suggest,
                trials=tpe_trials,
                max_evals=700)

    full_results = pd.DataFrame({
        'loss': [x['loss'] for x in tpe_trials.results],
        'iteration': tpe_trials.idxs_vals[0]['tau_dt'],
        'tau_dt': tpe_trials.idxs_vals[1]['tau_dt'],
        'h': tpe_trials.idxs_vals[1]['h'],
        'gi': tpe_trials.idxs_vals[1]['gi'],
        'excitation_amplitude': tpe_trials.idxs_vals[1]['excitation_amplitude'],
        'excitation_sigma': tpe_trials.idxs_vals[1]['excitation_sigma']
    })
    full_results.to_csv(os.path.join(output_path, "dnf_cdnet.csv"))

    print(best)
    print(space_eval(space_dnf, best))
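    # note on the access pattern above: trials.idxs_vals is a pair (idxs, vals)
    # of dicts keyed by parameter name, so idxs_vals[0]['tau_dt'] lists the
    # trial indices where 'tau_dt' was sampled and idxs_vals[1]['tau_dt'] the values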
Example #27
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
                    verbose=False)
        results_saver.add_log(
            'Best params for network type {} and dataset {} are: {}\n{}'.
            format(model, datasets_helper.get_dataset_name(), best,
                   space_eval(space, best)))
        results_saver.write_any('best_params', [
            model,
            datasets_helper.get_dataset_name(),
            space_eval(space, best)
        ], 'a')
        #results_saver.write_2D_list([[model,datasets_helper.get_dataset_name(),best]],'best_params','a')
    datasets_helper.reset_dataset_counter()
"""best_run, best_model = optim.minimize(model=test,
                                          data=[],
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())"""
Example #28
    #name = model + '_' + regularizer + '_' + '_'.join(feature_list) + '_' + str(hyperparams[0])
    name = model + '_' + regularizer + '_' + '_'.join(feature_list)
    datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    print(name)
    return experiment.run_group_experiment(name, datasets, 0, feature_list,
                                               model_type=model, regularizer=regularizer)


trials = Trials()
best = fmin(call_experiment,
            space=space,
            algo=tpe.suggest,
            max_evals=3,
            trials=trials)

#rseed=np.random.randint(1, 4294967295)
#print best
print(space_eval(space, best))
print(trials.losses())
#for trial in trials.trials:
#    print trial



#run_group_experiment('profile_test', ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes'],
#                                    0, ['ngrams'], model_type='SVM')



#experiment_new.main()
Example #29
def test_space_eval():
    space = hp.choice("a", [("case 1", 1 + hp.lognormal("c1", 0, 1)), ("case 2", hp.uniform("c2", -10, 10))])

    assert space_eval(space, {"a": 0, "c1": 1.0}) == ("case 1", 2.0)
    assert space_eval(space, {"a": 1, "c2": 3.5}) == ("case 2", 3.5)
Example #30
    def optimize(self, input_data=None):
        X, Y = input_data
        # X = X.reshape(X.shape[0], X.shape[1], 1)
        in_shape = X.shape
        space = self.architecture.get_space()

        def get_calls(partition):
            from keras import callbacks as C
            calls = list()
            # calls.append( C.ModelCheckpoint(save_dir +'/'+'-partition_{}'.format(partition)+'-epoch_{epoch:02d}-weights.h5', save_best_only=True, save_weights_only=True, verbose=1) )
            # calls.append( C.CSVLogger(args.save_dir + '/log.csv') )
            # calls.append( C.TensorBoard(log_dir=args.save_dir + '/tensorboard-logs/{}'.format(actual_partition), batch_size=args.batch_size, histogram_freq=args.debug) )
            calls.append( C.EarlyStopping(monitor='val_loss', patience=5, verbose=0) )
            # calls.append( C.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.0001, verbose=0) )
            calls.append( C.LearningRateScheduler(schedule=lambda epoch: 0.001 * (0.9 ** epoch)) )
        #    calls.append( C.LearningRateScheduler(schedule=lambda epoch: 0.001 * np.exp(-epoch / 10.)) )
            # calls.append( CyclicLR(mode='exp_range', gamma=0.99994) )
            # calls.append( CyclicLR(mode='triangular2') )
            calls.append( ModelCheckpoint('best-weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True) )

            return calls



        def f(hp_params):
            try:
                nsplits = 2
                cnt1 = 0
                scores, score = [], None
                seed = 123

                if len(trials.trials)>1:
                    for x in trials.trials[:-1]:
                        space_point_index = dict([(key,value[0]) for key,value in x['misc']['vals'].items() if len(value)>0])
                        peval = space_eval(space,space_point_index)
                        if hp_params == peval:
                            # print('>>> Repeated Evaluation')
                            loss = x['result']['loss']
                            return {'loss':loss, 'status':STATUS_FAIL}

                self.evaluations += 1
                print('Testing parameters (eval {})'.format(self.evaluations))
                print(hp_params)

                # Folds for train - test (Evaluate Model)
                folds1 = StratifiedShuffleSplit(n_splits=nsplits, random_state=seed)
                for train_index, test_index in folds1.split(X, Y):
                    X_train, X_test = X[train_index], X[test_index]
                    Y_train, Y_test = Y[train_index], Y[test_index]
                    
                    cnt1 += 1

                    # Folds for train - validation (Guide training phase)
                    folds2 = StratifiedShuffleSplit(n_splits=1, random_state=17, test_size=0.05)
                    for t_index, v_index in folds2.split(X_train, Y_train):
                        x_train, x_val = X_train[t_index], X_train[v_index]
                        y_train, y_val = Y_train[t_index], Y_train[v_index]
                        # val_data=(x_val, y_val)
                        val_data=([x_val, y_val], [y_val, x_val])

                        lam_recon = 0.392
                        # lam_recon = 0.0001

                        # print('YYYYY', Y_train.shape)

                        model = self.architecture.define_architecture(in_shape, hp_params)
                        # model = Model([in_layer], [out_layer])
                        losses = [margin_loss, 'binary_crossentropy']
                        model.compile(optimizer=optimizers.Adam(lr=0.001), loss=losses, loss_weights=[1., lam_recon])
                        # if cnt1 == 1:
                        #     model.summary()

                        # Update partition couter
                        print('Partition', cnt1)

                        # Get Updated Callbacks
                        calls = get_calls(cnt1)


                        _X = [X_train, Y_train]
                        _Y = [Y_train, X_train]

                        # Train model
                        model.fit(_X, _Y, epochs=200, batch_size=128, verbose=1, callbacks=calls, validation_data=val_data)

                        _X = [X_test, Y_test]
                        _Y = [Y_test, X_test]

                        model.load_weights('best-weights.h5')

                        stats = self.eval(model, (_X, _Y))
                        score = -stats.Mcc
                        scores.append(score)
                        print('score', score)

                        K.clear_session()
                        del model

    #            for t_index, v_index in folds.split(X, Y):
    #                X_train, X_val = X[t_index], X[v_index]
    #                Y_train, Y_val = Y[t_index], Y[v_index]      
    #                val_data=(X_val, Y_val)
    #
    #                calls = get_calls()
    #
    #                in_layer, out_layer = self.architecture.define_architecture(in_shape, hp_params)
    #                model = Model([in_layer], [out_layer])
    #                model.compile(optimizer='adam', loss='binary_crossentropy')
    #                print(model.summary)
    #                
    #                model.fit(X_train, Y_train, epochs=5, batch_size=32, verbose=1,callbacks=calls, validation_data=val_data)
    #
    #                stats = self.eval(model, (X_train,Y_train))
    #                score = stats.Mcc
    #                scores.append(score)
    #
    #                K.clear_session()
    #                del model

                final_score = np.mean(scores)
                print('scores', scores)
                print('final_score', final_score)
                print("")
                return {'loss':final_score, 'status':STATUS_OK}
            except Exception as e:
                print(e)
                return {'loss':None, 'status':STATUS_FAIL}

        trials = Trials()

        best = fmin(f, space, algo=tpe.suggest, trials=trials, max_evals=200)
        best_params = space_eval(space, best)

        return best, best_params
Example #31
    def model(self):
        #cname = sys._getframe().f_code.co_name
        cname = 'p'
        train, y, test = self.train_, self.y_, self.test_
        train.drop('id', axis=1, inplace=True)
        test.drop('id', axis=1, inplace=True)
        from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
        def step_et(params):
            clf = ensemble.ExtraTreesRegressor(**params)
            cv = model_selection.cross_val_score(clf,
                                                 train, y,
                                                 scoring=metrics.make_scorer(metrics.log_loss),
                                                 cv = 5,
                                                 n_jobs = -2)
            score = np.mean(cv)
            print(cname, score, params)
            return dict(loss=score, status=STATUS_OK)
        space_et = dict(
            n_estimators = hp.choice('n_estimators', range(50, 1500)),
            min_samples_split = hp.choice('min_samples_split', range(2, 10)),
            min_samples_leaf = hp.choice('min_samples_leaf', range(1, 10)),
            max_features = hp.choice('max_features', range(4, 16)),
            random_state = 1
            )
        trs = self.load('et_trials')
        if trs is None or self.debug_:
            tr = Trials()
        else:
            tr, _ = trs
        if len(tr.trials) > 0:
            print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_et, tr.argmin))
            best = tr.argmin
        while len(tr.trials) < 30:
            best = fmin(step_et, space_et, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
            self.save('et_trials', (tr, space_et))
        et_params = space_eval(space_et, best)
        print(et_params)

        N_splits = 9
        N_seeds = 3

        v, z = self.v_, self.z_
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
        cv = []
        for s in range(N_seeds):
            scores = []
            cname2 = cname + str(s)
            v[cname2], z[cname2] = 0, 0
            et_params['random_state'] = s + 4242
            for n, (itrain, ival) in enumerate(skf.split(train, y)):
                clf = ensemble.ExtraTreesRegressor(**et_params)
                clf.fit(train.iloc[itrain], y[itrain])
                p = clf.predict(train.iloc[ival])
                v.loc[ival, cname2] += p
                score = metrics.log_loss(y[ival], p)
                z[cname2]  += clf.predict(test)
                print(cname, 'seed %d step %d of %d: '%(et_params['random_state'], n+1, skf.n_splits), score, self.now())
                scores.append(score)
            z[cname2] /= N_splits
            cv.append(np.mean(scores))
            print('seed %d loss: '%(et_params['random_state']), scores, np.mean(scores), np.std(scores))
            z['y'] = z[cname2]

        print('cv:', cv, np.mean(cv), np.std(cv))
        return cv, None
Example #32
        max_jobs = args.evals  #= inf
        poll_interval = 300  #= 5 min
        reserve_timeout = 3600  #= 1 hour
        workdir = None
    sys.argv[0] = args.worker_helper
    sys.exit(main_worker_helper(Options(), None))

elif args.action == 'optimizer':
    # run distributed optimizer
    trials = MongoTrials(args.mongo, exp_key=args.exp_key)
    best = fmin(optimize_exec.objective, space, trials=trials, algo=tpe.suggest, max_evals=args.evals)

    # summary
    print()
    print("evals: {}".format(args.evals))
    print("best:  {}".format(best))
    print("space: {}".format(space_eval(space, best)))

elif args.action == 'list_best':
    # list distributed evaluation results
    best = list_best(args.mongo, exp_key=args.exp_key, space=space)

    # summary
    print()
    print("evals: {}".format(args.evals))
    print("best:  {}".format(best))
    print("space: {}".format(space_eval(space, best)))

else:
    raise Exception("Invalid action '{}'".format(args.action))
Example #34
    TP = CM[1][1]
    FP = CM[0][1]
    FN = CM[1][0]  # false negatives, inferred from the standard sklearn confusion-matrix layout
    print("TP = {}".format(TP))
    print("FP = {}".format(FP))
    print("FN = {}".format(FN))

    f1 = 2. * TP / (2. * TP + FP + FN)
    print("F1 : ", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK}


space = {
    'n_estimators': hp.choice('n_estimators',
                              np.arange(300, 1600, 100, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(10, 30, dtype=int)),
    'max_features': hp.choice('max_features', np.arange(10, 50, dtype=int)),
    'mss': hp.choice('mss', np.arange(2, 100, 2, dtype=int)),
    'cw': hp.uniform('cw', 1, 30),
    'msl': hp.choice('msl', np.arange(1, 30, dtype=int))
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

best_pars = hyperopt.space_eval(space, best)
pprint(best_pars)
Example #35
    def optimize(self):
        assert self.opt.evaluation_on_learning, \
            'evaluation must be set to be true to do hyperparameter optimization.'
        if self.opt.optimize.loss.startswith("val"):
            assert self.opt.validation, \
                'validation option must be set to be true to do hyperparameter optimization with validation results.'

        opt = self.opt.optimize
        iters, max_trials = 0, opt.get('max_trials', -1)
        space = self._get_space(opt.space)
        with log.ProgressBar(log.INFO, desc='optimizing... ',
                             total=None if max_trials == -1 else max_trials,
                             mininterval=30) as pbar:
            tb_opt = None
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            if opt.start_with_default_parameters:
                with log.supress_log_level(log.WARN):
                    loss = self._optimize({})
                self.logger.info(f'Starting with default parameter result: {loss}')
                self._optimization_info['best'] = loss
                if opt.deployment:
                    self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                    self.save(self.opt.model_path)
            # NOTE: need better way
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.initialize_tensorboard(1000000 if max_trials == -1 else max_trials,
                                        name_postfix='.optimize')
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            while max_trials:
                with log.supress_log_level(log.WARN):
                    raw_best_parameters = fmin(fn=self._optimize,
                                               space=space,
                                               algo=tpe.suggest,
                                               max_evals=len(self._optimization_info['trials'].trials) + 1,
                                               trials=self._optimization_info['trials'],
                                               show_progressbar=False)
                tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
                self.update_tensorboard_data(self._optimize_loss)
                tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
                iters += 1
                max_trials -= 1
                if self._optimization_info.get('best', {}).get('loss', 987654321) > self._optimize_loss['loss']:
                    is_first_time = self._optimization_info['best'] == {}
                    best_parameters = space_eval(space, raw_best_parameters)
                    best_loss = self._optimize_loss  # we cannot use return value of hyperopt due to randint behavior patch
                    self.logger.info(f'Found new best parameters: {best_parameters} @ iter {iters}')
                    self._optimization_info['best'] = best_loss
                    self._optimization_info['best_parameters'] = best_parameters
                    if opt.deployment and (is_first_time or not opt.min_trials or opt.min_trials >= iters):
                        if not self.opt.model_path:
                            raise RuntimeError('Failed to dump model: model path is not defined')
                        self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                        self.save(self.opt.model_path)
                if self.optimize_after_callback_fn:
                    self.optimize_after_callback_fn(self)
                pbar.update(1)
                self.logger.debug('Params({}) Losses({})'.format(self._optimize_params, self._optimize_loss))
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.finalize_tensorboard()

            return self.get_optimization_data()
Example #36
    def best(self):
        trials = Trials()
        best = fmin(self.f, self.space, algo=tpe.suggest, max_evals=self.max_evals,
                    rstate=np.random.RandomState(self.seed), trials=trials)
        # note: for hp.choice parameters fmin returns indices; space_eval in the
        # return below maps them back, while set_params here receives the raw dict
        self.clf.set_params(**best)
        return self.clf, self.name, self.name_short, space_eval(self.space, best), self.best_acc
Beispiel #37
    def run(self):
        self.Atrails = Trials()
        self.trails = Trials()

        if self.adaptation == 'TCA':
            adptdefault_value = {
                'kernel_type': 'linear',
                'dim': 5,
                'lamb': 1,
                'gamma': 1
            }
            adptparamSpace = {
                'kernel_type':
                hp.choice('kernel_type', ['primal', 'linear', 'rbf', 'sam']),
                'dim':
                hp.choice('dim',
                          range(5, max(self.sx.shape[1], self.tx.shape[1]))),
                'lamb':
                hp.uniform('lamb', 1e-6, 1e2),
                'gamma':
                hp.uniform('gamma', 1e-5, 1e2)
            }

        if self.adaptation == 'DBSCANfilter':
            adptdefault_value = {'eps': 1, 'min_samples': 10}
            adptparamSpace = {
                'eps': hp.uniform('eps', 0.1, 1e2),
                'min_samples': hp.choice('min_samples', range(1, 100))
            }

        if self.adaptation == 'Bruakfilter':
            adptdefault_value = {'n_neighbors': 10}
            adptparamSpace = {
                'n_neighbors': hp.choice('n_neighbors', range(1, 100))
            }

        if self.adaptation == 'Peterfilter':
            adptdefault_value = {'eachCluster': 10}
            adptparamSpace = {
                'eachCluster': hp.choice('eachCluster', range(1, 100))
            }

        if self.adaptation == 'Universal':
            adptdefault_value = {'pvalue': 0.05, 'QuantifyType': 'cliff'}
            adptparamSpace = {
                'pvalue': hp.uniform('pvalue', 0.01, 0.1),
                'QuantifyType': hp.choice('QuantifyType', ['cliff', 'cohen'])
            }

        if self.adaptation == 'DTB':
            adptdefault_value = {'DTBneighbors': 10, 'DTBT': 20}
            adptparamSpace = {
                'DTBneighbors': hp.choice('DTBneighbors', range(1, 50)),
                'DTBT': hp.choice('DTBT', range(5, 30))
            }

        if self.adaptation == 'DS':
            adptdefault_value = {'DStopn': 5, 'DSfss': 0.2}
            adptparamSpace = {
                'DStopn': hp.choice('DStopn', range(1, 15)),
                'DSfss': hp.uniform('DSfss', 0.1, 0.5)
            }

        if self.adaptation == 'DSBF':
            adptdefault_value = {'DSBFtopk': 1, 'DSBFneighbors': 10}
            adptparamSpace = {
                'DSBFtopk': hp.choice('DSBFtopk', range(1, 10)),
                'DSBFneighbors': hp.choice('DSBFneighbors', range(1, 100))
            }

        if self.clf == 'Boost':
            clfdefault_value = {'Boostnestimator': 50, 'BoostLearnrate': 1}
            clfparamSpace = {
                'Boostnestimator': hp.choice('Boostnestimator',
                                             range(10, 1000)),
                'BoostLearnrate': hp.uniform('BoostLearnrate', 0.01, 10)
            }

        if self.clf == 'RF':
            clfdefault_value = {
                'n_estimators': 10,
                'criterion': 'gini',
                'max_features': 'auto',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'n_estimators':
                hp.choice('n_estimators', range(10, 100)),
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        if self.clf == 'SVM':
            clfdefault_value = {
                'SVCkernel': {
                    'kernel': 'poly',
                    'degree': 3,
                    'polycoef0': 0.0,
                    'polygamma': 1
                },
                'C': 1.0
            }
            clfparamSpace = {
                'SVCkernel':
                hp.choice(
                    'SVCkernel',
                    [{
                        'kernel': 'linear'
                    }, {
                        'kernel': 'poly',
                        'degree': hp.choice('degree', range(1, 5)),
                        'polycoef0': hp.uniform('polycoef0', 0, 10),
                        'polygamma': hp.choice('polygamma', ["auto", "scale"])
                    }, {
                        'kernel': 'sigmoid',
                        'sigcoef0': hp.uniform('sigcoef0', 0, 10),
                        'siggamma': hp.choice('siggamma', ["auto", "scale"])
                    }, {
                        'kernel': 'rbf',
                        'rbfgamma': hp.choice('rbfgamma', ["auto", "scale"])
                    }]),
                'C':
                hp.uniform('C', 0.001, 1000),
            }

        if self.clf == 'NN':
            clfdefault_value = {
                'NNactive': 'relu',
                'NNalpha': 0.0001,
                'NNmaxiter': 200
            }
            clfparamSpace = {
                'NNactive':
                hp.choice('NNactive',
                          ['identity', 'logistic', 'tanh', 'relu']),
                'NNalpha':
                hp.uniform('NNalpha', 1e-6, 1),
                'NNmaxiter':
                hp.choice('NNmaxiter', range(100, 1000))
            }

        if self.clf == 'KNN':
            clfdefault_value = {'KNNneighbors': 1}
            clfparamSpace = {
                'KNNneighbors': hp.choice('KNNneighbors', range(1, 50))
            }

        if self.clf == 'NB':
            clfdefault_value = {'NBType': 'gaussian'}
            clfparamSpace = {
                'NBType':
                hp.choice('NBType', ['gaussian', 'multinomial', 'bernoulli'])
            }

        if self.clf == 'Ridge':
            clfdefault_value = {'Ridgealpha': 1, 'Ridgenormalize': False}
            clfparamSpace = {
                'Ridgealpha': hp.uniform('Ridgealpha', 0.001, 1000),
                'Ridgenormalize': hp.choice('Ridgenormalize', [True, False])
            }

        if self.clf == 'CART':
            clfdefault_value = {
                'criterion': 'gini',
                'max_features': 'auto',
                'CARTsplitter': 'best',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'CARTsplitter':
                hp.choice('CARTsplitter', ['best', 'random']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        default_value = dict(adptdefault_value, **clfdefault_value)
        self.def_value = self.objFunc(default_value)['result']
        self.Adptbest = fmin(self.objFunc,
                             space=adptparamSpace,
                             algo=tpe.suggest,
                             max_evals=int(self.fe * 0.5),
                             trials=self.Atrails)
        self.Adptbest = space_eval(adptparamSpace, self.Adptbest)

        his = dict()
        try:
            his['name'] = list(
                self.Atrails.trials[0]['misc']['vals'].keys()) + list(
                    clfdefault_value.keys())
        except:
            his['name'] = [None]
        i = 0
        for item in self.Atrails.trials:
            if item['state'] == 2:
                results = list(deepflatten(
                    item['misc']['vals'].values())) + list(
                        clfdefault_value.values())
                results.append(item['result']['result'])
                his[i] = results
                i += 1

        self.SEQ = 1
        Clfbest = fmin(self.objFunc,
                       space=clfparamSpace,
                       algo=tpe.suggest,
                       max_evals=int(self.fe * 0.5),
                       trials=self.trails)

        try:
            his['name1'] = list(self.Adptbest.keys()) + list(
                self.trails.trials[0]['misc']['vals'].keys())
        except:
            his['name1'] = [None]
        for item in self.trails.trials:
            if item['state'] == 2:
                results = list(self.Adptbest.values()) + list(
                    deepflatten(item['misc']['vals'].values()))
                results.append(item['result']['result'])
                his[i] = results
                i += 1

        inc_value = self.trails.best_trial['result']['result']

        return np.asarray([self.def_value, inc_value]), his, Clfbest
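
The 'SVCkernel' entry above is a conditional sub-space: the dict returned by space_eval only carries the keys that belong to the sampled kernel. A hedged sketch of how such a sample could be unpacked into sklearn SVC keyword arguments (the helper name is illustrative; the keys follow the space above):

from sklearn.svm import SVC

def build_svc(params):
    kernel_cfg = dict(params['SVCkernel'])  # copy so popping is safe
    kernel = kernel_cfg.pop('kernel')
    kwargs = {'kernel': kernel, 'C': params['C']}
    if kernel == 'poly':
        kwargs.update(degree=kernel_cfg['degree'],
                      coef0=kernel_cfg['polycoef0'],
                      gamma=kernel_cfg['polygamma'])
    elif kernel == 'sigmoid':
        kwargs.update(coef0=kernel_cfg['sigcoef0'],
                      gamma=kernel_cfg['siggamma'])
    elif kernel == 'rbf':
        kwargs.update(gamma=kernel_cfg['rbfgamma'])
    return SVC(**kwargs)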
Beispiel #38
    def param_optimize(self,
                       X_train,
                       y_train,
                       X_test,
                       y_test,
                       embedding_dim=(3, 6),
                       rnn_cells=[64, 128, 256, 512],
                       hidden_units_layers=(2, 4),
                       hidden_units_cells1=[64, 128, 256, 512],
                       hidden_units_cells2=[16, 32, 64, 128],
                       hidden_units_cells3=[4, 8, 16, 32],
                       hidden_units_cells4=[2, 4, 8],
                       activation=['swish', 'tanh', 'sigmoid', 'relu'],
                       dropout=(0.1, 0.8),
                       epochs=10,
                       batch_size=256,
                       max_evals=100):
        """
        通过贝叶斯调参寻找最优参数
        Args:
            X: 训练集X数据集
            y: 训练集y数据集
            X_test: 测试集X数据集
            y_test: 测试集y数据集
            embedding_dim: 离散数据embedding层输出维度,tuple类型,表示最小最大整数范围
            rnn_cells: 序列数据最终输出层神经元个数,list类型,指定枚举值
            hidden_units_layers: MLP层的层数,最多支持4层
            hidden_units_cells[i]: MLP每层神经元数
            activation: 激活函数枚举值
            dropout: dropout的调参范围,必须是0-1之间的实数
            #######以下非调优参数#######
            epochs: 指定epochs,不对该参数进行调优
            batch_size: 指定batch_size,不对该参数进行调优
            max_evals: 优化器最大迭代次数
        """
        # space = {
        #     'x': hp.uniform('x', 0, 1),  # uniform distribution on [0, 1]
        #     'y': hp.normal('y', 0, 1),  # normal distribution (mean 0, std 1)
        #     'name': hp.choice('name', ['alice', 'bob']), }  # enumerated values

        # Build the search space to tune from the arguments
        space = {
            'embedding_dim':
            hp.choice('embedding_dim',
                      list(range(embedding_dim[0], embedding_dim[1] + 1))),
            'rnn_cells':
            hp.choice('rnn_cells', rnn_cells),
            'activation':
            hp.choice('activation', activation),
            'dropout':
            hp.uniform('dropout', dropout[0], dropout[1]),  # uniform distribution over the range
            'hidden_units_cells1':
            hp.choice('hidden_units_cells1', hidden_units_cells1),
            'hidden_units_cells2':
            hp.choice('hidden_units_cells2', hidden_units_cells2),
            'hidden_units_cells3':
            hp.choice('hidden_units_cells3', hidden_units_cells3),
            'hidden_units_cells4':
            hp.choice('hidden_units_cells4', hidden_units_cells4),
            'hidden_units_layers':
            hp.choice(
                'hidden_units_layers',
                list(range(hidden_units_layers[0],
                           hidden_units_layers[1] + 1))),
        }

        # define an objective function
        def objective(args):
            print("开始优化迭代,本次参数为:")
            print(args)

            hidden_units_layers = []
            for i in range(1, args['hidden_units_layers'] + 1):
                hidden_units_layers.append(args['hidden_units_cells' + str(i)])

            self.compile(embedding_dim=args['embedding_dim'],
                         rnn_cells=args['rnn_cells'],
                         activation=args['activation'],
                         dropout=args['dropout'],
                         hidden_units=hidden_units_layers,
                         summary=False)

            self.fit(X_train,
                     y_train,
                     epochs=epochs,
                     batch_size=batch_size,
                     verbose=0,
                     callback=False)
            result = self.predict(X_test)
            auc = lapras.AUC(result.reshape(-1, ), y_test.reshape(-1, ))
            print("本次验证集AUC值为:" + str(auc))
            print(
                "==================================================================="
            )

            return -auc  # 优化目标AUC最大,也就是-AUC最小

        # minimize the objective over the space
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_evals)

        best_params = hyperopt.space_eval(space, best)
        print("优化完成,最优参数为:" + str(best_params))
        print("\n")
        print("开始按照最优参数重新训练模型……")

        hidden_units_layers = []
        for i in range(1, best_params['hidden_units_layers'] + 1):
            hidden_units_layers.append(best_params['hidden_units_cells' +
                                                   str(i)])

        self.compile(embedding_dim=best_params['embedding_dim'],
                     rnn_cells=best_params['rnn_cells'],
                     activation=best_params['activation'],
                     dropout=best_params['dropout'],
                     hidden_units=hidden_units_layers,
                     summary=False)

        self.fit(X_train,
                 y_train,
                 X_test,
                 y_test,
                 epochs=epochs,
                 batch_size=batch_size,
                 verbose=0)
        print("模型已按照最优参数重新训练,可直接使用")

        return best_params
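
A hypothetical call of param_optimize, assuming model is an instance of this class and the four arrays are already prepared; the small max_evals just keeps the sketch cheap:

best_params = model.param_optimize(X_train, y_train, X_test, y_test,
                                   epochs=5,
                                   batch_size=128,
                                   max_evals=20)
print(best_params['activation'], best_params['dropout'])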
Beispiel #39
def opt_params(X, y, session, list_session, classifier_name, ind_params,
               search_method, type_of_data, feature_select):

    # compute score for each param set
    def hyperopt_train_test(params):
        logo = LeaveOneGroupOut()  # create fold indices
        logo_generator = logo.split(X, y, session)
        #lolo_generator = logo.split(X,y, list_session)
        skf = StratifiedKFold(n_splits=5)
        cv_generator = skf.split(X, y)  # generator of (train, test) index pairs

        params_joint = ind_params.copy()
        params_joint.update(params)
        if classifier_name == 'RF':
            clf = RF(**params_joint)

        if classifier_name == 'XGB':
            clf = xgb.XGBClassifier(**params_joint)

        if classifier_name == 'SVM':
            clf = svm.SVC(**params_joint)

        if classifier_name == 'L2':
            clf = LogisticRegression(**params_joint)

        if classifier_name == 'noisy_L2':
            clf = noisy_LogisticRegression(**params_joint)

        if classifier_name == 'L1':
            clf = LogisticRegression(**params_joint)

        n_sessions = len(np.unique(session))
        if n_sessions >= 2:
            cross_scores = cross_val_score(clf,
                                           X,
                                           y,
                                           cv=logo_generator,
                                           scoring='roc_auc',
                                           n_jobs=10)
        else:
            #cross_scores = cross_val_score(clf, X,y, cv = lolo_generator, n_jobs = 10, scoring = 'roc_auc')
            cross_scores = cross_val_score(clf,
                                           X,
                                           y,
                                           cv=cv_generator,
                                           n_jobs=10,
                                           scoring='roc_auc')
        return cross_scores.mean()

    # define search parameter space
    if classifier_name == 'RF':
        space4classifier = {
            'n_estimators':
            hp.choice('n_estimators', [500]),
            #'n_estimators': hp.qloguniform('n_estimators', 100,1000,1),
            'max_features':
            hp.choice('max_features', ['sqrt', 'log2', 0.2]),
            #'max_features': hp.choice('max_features', np.arange(100, X.shape[1],200)),
            'max_depth':
            hp.choice('max_depth', np.arange(4, 10, step=2)),
            'min_samples_leaf':
            hp.choice('min_samples_leaf', np.arange(5, 20, step=5))
        }
    if classifier_name == 'XGB':
        space4classifier = {
            'n_estimators': hp.choice('n_estimators',
                                      np.arange(50, 500, step=50)),
            #'n_estimators': hp.qloguniform('n_estimators', 5,10,1.0),
            'max_depth': hp.choice('max_depth', np.arange(4, 10, 1)),
            'learning_rate': hp.choice('learning_rate', [0.01, 0.1]),

            #'reg_lambda': hp.choice('reg_lambda', [0,0.01,0.05,0.1,1.0,10]),
            #'min_child_weight':hp.choice('min_child_weight',np.arange(3.0,7.0, step =2.0)),
            #'scale_pos_weight':hp.uniform('scale_pos_weight',1,10),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
            'subsample': hp.uniform('subsample', 0.7, 1.0)
        }

    if classifier_name == 'SVM':
        space4classifier = {
            #'C': hp.choice('C', 10.**np.arange(-2,10, step = 1.0)),
            'C': hp.loguniform('C', 0, 10),
            'gamma': hp.loguniform('gamma', -5, 5)
            #'gamma': hp.choice('gamma', ['auto']),
            #'kernel':hp.choice('kernel', ['rbf', 'poly', 'sigmoid']),
            #'degree':hp.choice('degree', np.arange(3,6))
        }

    if classifier_name == 'noisy_L2':
        space4classifier = {
            #'C': hp.choice('C', 10.**np.arange(-2,10, step = 1.0)),
            'C': hp.loguniform('C', -10, -4),
            'sigma_noise': hp.choice('sigma_noise', [0.01, 0.05, 0.1]),
            'noise_penalty': hp.loguniform('noise_penalty', -10, -4)
            #'gamma': hp.choice('gamma', ['auto']),
            #'kernel':hp.choice('kernel', ['rbf', 'poly', 'sigmoid']),
            #'degree':hp.choice('degree', np.arange(3,6))
        }

    if classifier_name == 'L2':
        #space4classifier ={'C': hp.choice('C',np.append(10.**np.arange(-5,-1,step =0.25), np.array(7.2e-4)))
        if type_of_data == 'long':
            space4classifier = {'C': hp.loguniform('C', -20, -10)}
        elif feature_select == 1:
            space4classifier = {'C': hp.loguniform('C', -20, -10)}
        else:
            space4classifier = {'C': hp.loguniform('C', -10, -5)}

        #'penalty': hp.choice('penalty',['l1','l2'])

    if classifier_name == 'L1':
        #space4classifier ={'C': hp.choice('C',np.append(10.**np.arange(-5,-1,step =0.25), np.array(7.2e-4)))
        if type_of_data == 'long':
            space4classifier = {'C': hp.loguniform('C', -15, -7)}
        elif feature_select == 1:
            space4classifier = {'C': hp.loguniform('C', -3, -2)}
        else:
            space4classifier = {'C': hp.loguniform('C', -3, -2)}

    global best
    best = 0.5

    def f(params):
        global best
        acc = hyperopt_train_test(params)
        if acc > best:
            best = acc
            print('new best:', best, params)
        return {'loss': -acc, 'status': STATUS_OK}

    #trials = Trials()  # saving trials
    if search_method == 'rand':
        best_params = fmin(f,
                           space4classifier,
                           algo=hyperopt.rand.suggest,
                           max_evals=20)

    if search_method == 'tpe':
        best_params = fmin(f,
                           space4classifier,
                           algo=hyperopt.tpe.suggest,
                           max_evals=20)

    return space_eval(space4classifier, best_params)
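
The global best bookkeeping above can be avoided by closing over a mutable cell (or by reading the Trials object afterwards). A minimal sketch of the closure variant, with an illustrative objective standing in for the cross-validated score:

from hyperopt import fmin, tpe, hp, STATUS_OK

def make_objective():
    state = {'best': float('-inf')}
    def f(params):
        acc = 1.0 - (params['C'] - 1.0) ** 2  # stand-in for a CV score
        if acc > state['best']:
            state['best'] = acc
            print('new best:', acc, params)
        return {'loss': -acc, 'status': STATUS_OK}
    return f

best_params = fmin(make_objective(),
                   {'C': hp.loguniform('C', -5, 5)},
                   algo=tpe.suggest,
                   max_evals=20)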
Beispiel #40
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1)
    
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}
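
Per the note above, bagging only takes effect when both values are set (and the frequency is positive). A hedged sketch of a LightGBM search space with bagging actually enabled; the ranges are illustrative:

from hyperopt import hp

bagging_space = {
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
    'bagging_freq': hp.choice('bagging_freq', [1, 5, 10]),  # bag every k iterations
    'feature_fraction': hp.uniform('feature_fraction', 0.7, 1.0),
}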

# Set algorithm parameters
with Timer('XGBoost, Search') as t:
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=20)
    best_params = space_eval(space, best)


# Print best parameters
best_params['max_depth'] = int(best_params['max_depth'])
print("BEST PARAMS: ", best_params)

clf = xgb.XGBClassifier(
    n_estimators=300,
    **best_params,
    tree_method='hist',
    eval_metric="auc",
    n_jobs=-1,
    scale_pos_weight=136
)
def run(train, y, test, v, z):
    np.random.seed(1)
    #cname = sys._getframe().f_code.co_name
    train = train.values
    test = test.values

    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    space_stack = hp.choice('stacking by', [
            dict( type = 'BayesianRidge' ),
            dict( type = 'Lars' ),
            dict( type = 'LinearRegression' ),
            dict( type = 'Ridge' ),
            dict( type = 'SGDRegressor', random_state = 1 ),
            dict( type = 'XGBRegressor',
                 max_depth = hp.choice('max_depth', range(2, 8)),
                 subsample = hp.quniform('subsample', 0.6, 1, 0.05),
                 colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
                 learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
                 min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
                 gamma = hp.quniform('gamma', 0, 10, 0.05),
                 reg_alpha = hp.quniform('alpha', 0, 1, 0.0001),
                 ),
                                            ])

    def get_lr(params):
        t = params['type']
        del params['type']

        if t == 'BayesianRidge':
            lr = linear_model.BayesianRidge(**params)
        elif t == 'Lars':
            lr = linear_model.Lars(**params)
        elif t == 'LinearRegression':
            lr = linear_model.LinearRegression(**params)
        elif t == 'Ridge':
            lr = linear_model.Ridge(**params)
        elif t == 'SGDRegressor':
            lr = linear_model.SGDRegressor(**params)
        elif t == 'XGBRegressor':
            lr = xgb.XGBRegressor(**params)

        return lr

    def step(params):
        print(params, end = ' ')
        cv = model_selection.cross_val_score(get_lr(params),
                                             train, y,
                                             cv=10,
                                             scoring=metrics.make_scorer(metrics.log_loss))
        score = np.mean(cv)
        print(score)
        return dict(loss=score, status=STATUS_OK)

    trs = state.load('trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        best = tr.argmin
        print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_stack, best))
    mt = max(50, len(tr.trials) + 1)
    while len(tr.trials) < min(50, mt):
        best = fmin(step, space_stack, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
        state.save('trials', (tr, space_stack))
    params = space_eval(space_stack, best)

    print('best params:', params)
    lr = get_lr(params)
    cv = model_selection.cross_val_score(lr,
                                         train, y,
                                         cv=10,
                                         scoring=metrics.make_scorer(metrics.log_loss))
    lr.fit(train, y)
    z['p'] = np.clip(lr.predict(test), 1e-5, 1-1e-5)
    z['y'] = z['p']
    v['p'] = model_selection.cross_val_predict(lr,
                                         train, y,
                                         cv=10)
    print('cv:', np.mean(cv), np.std(cv))
    return cv, None
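
state.load and state.save above are project-specific helpers; the underlying idea is to checkpoint the Trials object after every evaluation so an interrupted run can resume where it left off. A hedged sketch of the same pattern using plain pickle (path and objective are illustrative):

import os
import pickle
from hyperopt import fmin, tpe, hp, Trials

TRIALS_PATH = 'trials.pkl'  # illustrative checkpoint path

def load_trials():
    if os.path.exists(TRIALS_PATH):
        with open(TRIALS_PATH, 'rb') as f:
            return pickle.load(f)
    return Trials()

trials = load_trials()
while len(trials.trials) < 50:
    best = fmin(lambda x: (x - 3) ** 2,  # illustrative objective
                hp.uniform('x', -10, 10),
                algo=tpe.suggest,
                max_evals=len(trials.trials) + 1,
                trials=trials)
    with open(TRIALS_PATH, 'wb') as f:
        pickle.dump(trials, f)  # checkpoint after every trial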
def bo_tpe_lightgbm(X, y):
    # Reference:
    # https://qiita.com/TomokIshii/items/3729c1b9c658cc48b5cb

    data = X
    target = y
    # Split the data twice so it ends up in three sets: test, train, validation
    X_intermediate, X_test, y_intermediate, y_test = train_test_split(
        data, target, shuffle=True, test_size=0.2, random_state=1)

    # train/validation split (gives us train and validation sets)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_intermediate,
        y_intermediate,
        shuffle=False,
        test_size=0.25,
        random_state=1)

    # delete intermediate variables
    del X_intermediate, y_intermediate

    # Show how the data set is split
    print('train: {}% | validation: {}% | test {}%'.format(
        round((len(y_train) / len(target)) * 100, 2),
        round((len(y_validation) / len(target)) * 100, 2),
        round((len(y_test) / len(target)) * 100, 2)))

    starttime = datetime.datetime.now()

    space = {
        # 'learning_rate': hp.uniform('learning_rate', 0.001, 0.5),
        # 'minibatch_frac': hp.choice('minibatch_frac', [1.0, 0.5]),
        # 'Base': hp.choice('Base', [b1, b2, b3])
        "lambda_l1": hp.uniform("lambda_l1", 1e-8, 1.0),
        "lambda_l2": hp.uniform("lambda_l2", 1e-8, 1.0),
        "min_child_samples": hp.uniformint("min_child_samples", 5, 100),
        'learning_rate': hp.uniform("learning_rate", 0.001, 0.5),
        "n_estimators": hp.uniformint("n_estimators", 10, 100),
        "num_leaves": hp.uniformint("num_leaves", 5, 35)
    }

    # n_estimators is the number of estimators per parameter set, i.e. how many boosting iterations run
    default_params = {
        # "n_estimators": 80,
        "random_state": 1,
        "objective": "regression",
        "boosting_type": "gbdt",
        # "num_leaves": 30,
        # "learning_rate": 0.3,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
    }

    def objective(params):
        #     the Dataset construction below is used by the classification
        #     API and cannot be used with the regressor:
        #     dtrain = lgb.Dataset(X_train, label=y_train)
        params.update(default_params)
        clf = lgb.LGBMRegressor(**params)
        score = -np.mean(
            cross_val_score(clf,
                            X_train,
                            y_train,
                            cv=3,
                            n_jobs=-1,
                            scoring="neg_mean_squared_error"))
        return {'loss': score, 'status': STATUS_OK}

    trials_lgb = Trials()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            # max_evals sets how many parameter combinations to try; more combinations may improve accuracy but lengthen training
            max_evals=50,
            trials=trials_lgb)

    best_params = space_eval(space, best)
    lgb_model = lgb.LGBMRegressor(**best_params).fit(
        X_train,
        y_train,
        eval_set=[(X_validation, y_validation)],
        verbose=-1,
        #  even if n_estimators allows e.g. 100 iterations, early stopping may end training before all of them run
        early_stopping_rounds=2)

    y_pred = lgb_model.predict(X_test)
    test_MSE_lgb = mean_squared_error(y_pred, y_test)
    print("LightGBM MSE score:%.4f" % test_MSE_lgb)
    endtime = datetime.datetime.now()
    process_time_lgb = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_lgb))
    print("最佳超参数值集合:", best_params)
    save_model_object(lgb_model, 'BO-TPE', 'NGBoost', 'NGBoost')
    return test_MSE_lgb, process_time_lgb, best_params
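
A hypothetical call of bo_tpe_lightgbm, assuming a scikit-learn style regression dataset:

from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)
test_mse, elapsed, params = bo_tpe_lightgbm(X, y)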
Beispiel #43
def main():

    usage = "%prog project label_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='LR',
                      help='Model: (LR|SVM|MNB|SVMNB); default=%default')
    parser.add_option('-f', dest='test_fold', default=0,
                      help='Test fold; default=%default')
    parser.add_option('-o', dest='output_prefix', default='bayes_opt',
                      help='Output prefix')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')
    parser.add_option('--alpha', dest='alpha', action="store_true", default=False,
                      help='Include alpha in search space (instead of grid search); default=%default')
    parser.add_option('--n_dev_folds', dest='n_dev_folds', default=None,
                      help='Number of dev folds to use when tuning/evaluating; default=%default')
    parser.add_option('-t', dest='target_col', default=2,
                      help='Index of column containing labels; default=%default')
    parser.add_option('-w', dest='weight_col', default=-1,
                      help='Index of column containing weights (-1 for None); default=%default')
    parser.add_option('-v', dest='verbose', default=1,
                      help='Level of verbosity; default=%default')
    parser.add_option('-n', dest='n_iter', default=60,
                      help='Number of iterations; default=%default')
    parser.add_option('-a', dest='add_pseudo', action="store_true", default=False,
                      help='Make use of pseudo-documents; default=%default')
    parser.add_option('--random', dest='random_search', action="store_true", default=False,
                      help='Use random search instead of TPE; default=%default')
    parser.add_option('--only_unanimous', dest='only_unanimous', action="store_true", default=False,
                      help='Use only the articles with unanimous agreement for evaluation; default=%default')



    #parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space, label_file, group, test_fold, n_dev_folds
    global weight_col, verbose, target, add_pseudo, only_unanimous

    project = args[0]
    dirs.make_base_dir(project)

    label_file = args[1]
    reuse = options.reuse
    search_alpha = options.alpha
    #n_codes = int(options.n_codes)
    output_prefix = options.output_prefix
    model = options.model
    test_fold = int(options.test_fold)
    if options.n_dev_folds is not None:
        n_dev_folds = int(options.n_dev_folds)
    else:
        n_dev_folds = None
    weight_col = int(options.weight_col)
    target = int(options.target_col)
    verbose = int(options.verbose)
    n_iter = int(options.n_iter)
    add_pseudo = options.add_pseudo
    random_search = options.random_search
    only_unanimous = options.only_unanimous

    # allow user to specify a particular choice of model
    if model == 'LR':
        space['model'] = {
            'model': 'LR',
            'regularization': hp.choice('regularization', ['l1', 'l2'])
            #'regularization': 'l1'
        }
    elif model == 'SVM':
        space['model'] = {
            'model': 'SVM',
            'kernel': hp.choice('ktype', [
                {'ktype': 'linear'},
                {'ktype': 'poly', 'degree': hp.choice('degree', [2, 3, 4])},
                {'ktype': 'rbf'}
            ]
                                )
        }
    elif model == 'MNB':
        space['model'] = {
            'model': 'MNB'
        }
    elif model == 'SVMNB':
        space['model'] = {
            'model': 'SVMNB',
            'beta': hp.uniform('beta', 0, 1)
        }
    else:
        sys.exit('Choice of model not supported!')

    if add_pseudo:
        add_pseudo_options()

    output_prefix += '_' + model

    if search_alpha:
        space['alpha'] = hp.loguniform('alpha', -3, 10)
        output_prefix += '_alpha'
    else:
        output_prefix += '_noalpha'

    if reuse:
        output_prefix += '_reuse'

    all_items, target_name, labels, weights, _ = lr.get_labels(label_file, target, weight_col=weight_col)
    output_dirname = experiment.make_exp_dir(test_fold, target_name, output_prefix)
    basedir = os.path.split(output_dirname)[0]
    output_filename = fh.make_filename(basedir, output_prefix, 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        output_file.write('reuse = ' + str(reuse) + '\n')
        output_file.write('search alphas = ' + str(search_alpha) + '\n')

    trials = Trials()

    if random_search:
        best = fmin(call_experiment,
                    space=space,
                    algo=rand.suggest,
                    max_evals=n_iter,
                    trials=trials)
    else:
        best = fmin(call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=n_iter,
                    trials=trials)

    print space_eval(space, best)
    print trials.losses()
    def model(self):
        #cname = sys._getframe().f_code.co_name
        cname = 'lgb'
        train, y, test = self.train_, self.y_, self.test_
        train.drop('id', axis=1, inplace=True)
        test.drop('id', axis=1, inplace=True)
        from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
        dtrain = lgb.Dataset(train, label=y)
        def fix_params(params):
            for p in ['min_data_in_leaf', 'num_leaves', 'max_bin' ]:
                params[p] = int(params[p])
            params['num_leaves'] = max(params['num_leaves'], 2)
        def step_xgb(params):
            fix_params(params)
            cv = lgb.cv(params, dtrain,
                        num_boost_round=10000,
                        early_stopping_rounds=50,
                        nfold=6,
                        seed=params['seed'])
            rounds = np.argmin(cv['binary_logloss-mean'])
            score = np.min(cv['binary_logloss-mean'])
            print(cname, score, rounds, params, self.now())
            return dict(loss=score, status=STATUS_OK)
        space_lgb = dict(
                bagging_fraction = hp.quniform('bagging_fraction', 0.5, 1, 0.001),
                colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
                feature_fraction = hp.quniform('feature_fraction', 0.5, 1, 0.001),
                lambda_l1 = hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]),
                lambda_l2 = hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]),
                learning_rate = hp.loguniform('learning_rate', -7, 0),
                max_bin = hp.qloguniform('max_bin', 0, 20, 1),
                max_depth = hp.choice('max_depth', range(2, 9)),
                min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
                min_data_in_leaf = hp.qloguniform('min_data_in_leaf', 0, 6, 1),
                min_sum_hessian_in_leaf = hp.loguniform('min_sum_hessian_in_leaf', -16, 5),
                num_leaves = hp.qloguniform('num_leaves', 2, 7, 1),
                reg_alpha = hp.quniform('reg_alpha', 0, 1, 0.001),
                subsample = hp.quniform('subsample', 0.6, 1, 0.05),

                bagging_freq = 1,
                objective = 'binary',
                metric = 'binary_logloss',
                seed = 1,
                #silent = 1,
            )
        trs = self.load('lightgbm_trials')
        if trs is None or self.debug_:
            tr = Trials()
        else:
            tr, _ = trs
        if len(tr.trials) > 0:
            print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_lgb, tr.argmin))
            best = tr.argmin
        while len(tr.trials) < self.max_ho_trials_:
            print(len(tr.trials), end=' ')
            #best = fmin(step_xgb, space_lgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
            best = fmin(step_xgb, space_lgb, algo=partial(tpe.suggest, n_startup_jobs=1), max_evals=len(tr.trials) + 1, trials = tr)
            self.save('lightgbm_trials', (tr, space_lgb))
        lgb_params = space_eval(space_lgb, best)
        fix_params(lgb_params)
        print(lgb_params)

        N_splits = self.num_splits_
        N_seeds = self.num_seeds_

        v, z = self.v_, self.z_
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
        cv = []
        for s in range(N_seeds):
            scores = []
            cname2 = cname + str(s)
            v[cname2], z[cname2] = 0, 0
            lgb_params['seed'] = s + self.base_seed_
            for n, (itrain, ival) in enumerate(skf.split(train, y)):
                dtrain = lgb.Dataset(train.iloc[itrain], y[itrain])
                dvalid = lgb.Dataset(train.iloc[ival], y[ival])
                clf = lgb.train(lgb_params, dtrain,
                                num_boost_round=10000,
                                valid_sets=[dtrain, dvalid],
                                valid_names=['train', 'valid'],
                                early_stopping_rounds=100, verbose_eval=False)

                p = clf.predict(train.iloc[ival])
                v.loc[ival, cname2] += p
                score = metrics.log_loss(y[ival], p)
                z[cname2]  += clf.predict(test)
                print(cname, 'seed %d step %d of %d: '%(lgb_params['seed'], n+1, skf.n_splits), score, self.now())
                scores.append(score)
            z[cname2] /= N_splits
            cv.append(np.mean(scores))
            print('seed %d loss: '%(lgb_params['seed']), scores, np.mean(scores), np.std(scores))
            z['y'] = z[cname2]

        print('cv:', cv, np.mean(cv), np.std(cv))
        return cv, None
Beispiel #45
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

# define an objective function
def objective(args):
    case, val = args
    if case == 'case 1':
        return val
    else:
        return val ** 2


if __name__ == '__main__':
    trials = Trials()

    # define a search space
    space = hp.choice('a',
                      [
                          ('case 1', 1 + hp.lognormal('c1', 0, 1)),
                          ('case 2', hp.uniform('c2', -10, 10))
                      ])

    # minimize the objective over the space
    best = fmin(objective, space, algo=tpe.suggest, max_evals=250, trials=trials)

    print best
    # -> {'a': 1, 'c2': 0.01420615366247227}
    print hyperopt.space_eval(space, best)
    # -> ('case 2', 0.01420615366247227)
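
For hp.choice, fmin returns the index of the chosen branch rather than the value itself, which is why space_eval is needed to map the result back into the space. A small illustration of the distinction:

from hyperopt import hp, space_eval

space = hp.choice('a', [('case 1', 0.5), ('case 2', 1.5)])
raw = {'a': 1}                 # the kind of dict fmin returns
print(space_eval(space, raw))  # -> ('case 2', 1.5)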
    def optimize(self):
        logging.warning('Start optimization for:' + self.__class__.__name__)
        evals = Configuration.HYPEROPT_EVALS_PER_SEARCH
        result = fmin(fn=self._objective, space=self._hyper_space, algo=tpe.suggest, max_evals=evals)
        return space_eval(self._hyper_space, result)
from sklearn import datasets
from hyperopt import fmin, tpe, hp, Trials
import numpy as np
import hyperopt
import space
import clf_objective_function

get_space = space.get_space
classifier_objective = clf_objective_function.classifier_objective

iris = datasets.load_iris()
X, y = iris.data, iris.target
X, y = X[y != 0, :2], y[y != 0]
X_og, y_og = X, y

trials = Trials()
best = fmin(fn=lambda x: classifier_objective(x, X, y),
            space=get_space(),
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print '\n\n'
print best
config = hyperopt.space_eval(get_space(), best)
print classifier_objective(config, X, y)
# print trials.results
def bo_tpe_ngb(X, y):
    # Reference example:
    # https://github.com/stanfordmlgroup/ngboost/blob/master/examples/tuning/hyperopt.ipynb
    data = X
    target = y
    # Split the data twice so it ends up in three sets: test, train, validation
    X_intermediate, X_test, y_intermediate, y_test = train_test_split(
        data, target, shuffle=True, test_size=0.2, random_state=1)

    # train/validation split (gives us train and validation sets)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_intermediate,
        y_intermediate,
        shuffle=False,
        test_size=0.25,
        random_state=1)

    # delete intermediate variables
    del X_intermediate, y_intermediate

    # Show how the data set is split
    print('train: {}% | validation: {}% | test {}%'.format(
        round((len(y_train) / len(target)) * 100, 2),
        round((len(y_validation) / len(target)) * 100, 2),
        round((len(y_test) / len(target)) * 100, 2)))

    starttime = datetime.datetime.now()

    # Search space definition
    b1 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=2)
    b2 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3)
    b3 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)

    space = {
        'learning_rate': hp.uniform('learning_rate', 0.001, 0.5),
        'minibatch_frac': hp.choice('minibatch_frac', [1.0, 0.5]),
        'Base': hp.choice('Base', [b1, b2, b3])
    }

    # n_estimators is the number of estimators per parameter set, i.e. how many boosting iterations run
    default_params = {"n_estimators": 20, "verbose_eval": 1, "random_state": 1}

    def objective(params):
        params.update(default_params)
        ngb = NGBRegressor(**params, verbose=False).fit(
            X_train,
            y_train,
            X_val=X_validation,
            Y_val=y_validation,
            #  even if n_estimators allows e.g. 100 iterations, early stopping may end training before all of them run
            early_stopping_rounds=2)
        loss = ngb.evals_result['val']['LOGSCORE'][ngb.best_val_loss_itr]
        results = {'loss': loss, 'status': STATUS_OK}
        return results

    trials_ngb = Trials()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            # max_evals sets how many parameter combinations to try; more combinations may improve accuracy but lengthen training
            max_evals=50,
            trials=trials_ngb)

    best_params = space_eval(space, best)

    ngb_new = NGBRegressor(**best_params, verbose=False).fit(
        X_train,
        y_train,
        X_val=X_validation,
        Y_val=y_validation,
        #  even if n_estimators allows e.g. 100 iterations, early stopping may end training before all of them run
        early_stopping_rounds=2)

    y_pred = ngb_new.predict(X_test)
    test_MSE_ngb = mean_squared_error(y_pred, y_test)
    print("NGBoost MSE score:%.4f" % test_MSE_ngb)
    endtime = datetime.datetime.now()
    process_time_ngb = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_ngb))
    print("最佳超参数值集合:", best_params)
    save_model_object(ngb_new, 'BO-TPE', 'NGBoost', 'NGBoost')
    return test_MSE_ngb, process_time_ngb, best_params
Beispiel #49
def main(main_args):

    usage = "%prog project label_file splits_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='LR',
                      help='Model: (LR|SVM|MNB|SVMNB); default=%default')
    parser.add_option('-o', dest='output_prefix', default='bayes_opt',
                      help='Output prefix')
    parser.add_option('--alpha', dest='alpha', action="store_true", default=False,
                      help='Include alpha in search space (instead of grid search); default=%default')
    parser.add_option('--n_dev_folds', dest='n_dev_folds', default=None,
                      help='Number of dev folds to use when tuning/evaluating; default=%default')
    parser.add_option('-t', dest='target_col', default=2,
                      help='Index of column containing labels; default=%default')
    parser.add_option('-w', dest='weight_col', default=-1,
                      help='Index of column containing weights (-1 for None); default=%default')
    parser.add_option('-v', dest='verbose', default=1,
                      help='Level of verbosity; default=%default')
    parser.add_option('-n', dest='n_iter', default=2,
                      help='Number of iterations; default=%default')
    parser.add_option('-a', dest='add_pseudo', action="store_true", default=False,
                      help='Make use of pseudo-documents; default=%default')
    parser.add_option('--random', dest='random_search', action="store_true", default=False,
                      help='Use random search instead of TPE; default=%default')
    parser.add_option('--personas_old', dest='personas_old', action="store_true", default=False,
                      help='Use personas from the DPM; default=%default')
    parser.add_option('--personas_new', dest='personas_new', action="store_true", default=False,
                      help='Use personas from my model; default=%default')
    parser.add_option('--story_types', dest='story_types', action="store_true", default=False,
                      help='Use story types from my model; default=%default')



    #parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()
    if len(args) == 0:
        args = main_args

    global output_dirname, output_filename, search_alpha, space, label_file, group, n_dev_folds
    global weight_col, verbose, target, add_pseudo, output_prefix
    global personas_old, personas_new, story_types

    project = args[0]
    label_file = args[1]
    splits_file = args[2]

    dirs.set_project(project, splits_file)

    search_alpha = options.alpha
    output_prefix = options.output_prefix
    model = options.model
    if options.n_dev_folds is not None:
        n_dev_folds = int(options.n_dev_folds)
    else:
        n_dev_folds = None
    weight_col = int(options.weight_col)
    target = int(options.target_col)
    verbose = int(options.verbose)
    n_iter = int(options.n_iter)
    add_pseudo = options.add_pseudo
    random_search = options.random_search
    personas_old = options.personas_old
    personas_new = options.personas_new
    story_types = options.story_types

    # allow user to specify a particular choice of model
    if model == 'LR':
        space['model'] = {
            'model': 'LR',
            #'regularization': hp.choice('regularization', ['l1', 'l2'])
            'regularization': 'l1'
        }
    elif model == 'SVM':
        space['model'] = {
            'model': 'SVM',
            'kernel': hp.choice('ktype', [
                {'ktype': 'linear'},
                {'ktype': 'poly', 'degree': hp.choice('degree', [2, 3, 4])},
                {'ktype': 'rbf'}
            ]
                                )
        }
    elif model == 'MNB':
        space['model'] = {
            'model': 'MNB'
        }
    elif model == 'SVMNB':
        space['model'] = {
            'model': 'SVMNB',
            'beta': hp.uniform('beta', 0, 1)
        }
    else:
        sys.exit('Choice of model not supported!')

    if personas_old:
        space['features']['personas'] = hp.choice('personas', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': hp.choice('personas_transform', ['binarize', 'normalizel1']),
                'subdir': 'personas',
                'source': 'personasdpm',
            }
        ])
    if personas_new:
        space['features']['personas'] = hp.choice('personas', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': hp.choice('personas_transform', ['binarize', 'normalizel1']),
                'subdir': 'personas',
                'source': 'personas',
            }
        ])
    if story_types:
        space['features']['storytypes'] = hp.choice('storytypes', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': 'normalizel1',
                'subdir': 'personas',
                'source': 'storytypesold',
            }
        ])


    if add_pseudo:
        add_pseudo_options()

    output_prefix += '_' + model

    if search_alpha:
        space['alpha'] = hp.loguniform('alpha', -3, 10)
        output_prefix += '_alpha'
    else:
        output_prefix += '_noalpha'


    all_items, target_name, labels, weights, _ = lr.get_labels(label_file, target, weight_col=weight_col)
    #for t in range(10):
    #    output_dirname = experiment2.make_exp_dir(t, target_name, output_prefix)
    output_dirname = fh.makedirs(dirs.exp_dir, target_name)
    output_filename = os.path.join(output_dirname, output_prefix + '.log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        output_file.write('search alphas = ' + str(search_alpha) + '\n')

    trials = Trials()

    if random_search:
        best = fmin(call_experiment,
                    space=space,
                    algo=rand.suggest,
                    max_evals=n_iter,
                    trials=trials)
    else:
        best = fmin(call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=n_iter,
                    trials=trials)

    print space_eval(space, best)
    print trials.losses()
Beispiel #50
    }


df = data()
df = df[df["ad"] == outlier]
agg = df.groupby("ad").sum().iloc[outlier]["av"]

# define a search space
a1_min = min(list(df["a1"].unique()))
a1_max = max(list(df["a1"].unique()))
a2_min = min(list(df["a2"].unique()))
a2_max = max(list(df["a2"].unique()))
q = 1

space = hp.choice(
    "parameters",
    [
        {
            "a1_lower": hp.quniform("a1_lower", a1_min, a1_max, q),
            "a1_upper": hp.quniform("a1_upper", a1_min, a1_max, q),
            "a2_lower": hp.quniform("a2_lower", a2_min, a2_max, q),
            "a2_upper": hp.quniform("a2_upper", a2_min, a2_max, q),
        },
    ],
)

trials = Trials()
# minimize the objective over the space
best = fmin(objective, space, algo=tpe.suggest, max_evals=1000, trials=trials)
print(space_eval(space, best))
	
	x = list( x )
	x.insert( 0, error )
	writer.writerow( x )
	
	return error

###

space = ( 
	hp.qloguniform( 'l1_dim', log( 10 ), log( 1000 ), 1 ), 
	hp.qloguniform( 'l2_dim', log( 10 ), log( 1000 ), 1 ),
	hp.loguniform( 'learning_rate', log( 1e-5 ), log( 1e-2 )),
	hp.uniform( 'momentum', 0.5, 0.99 ),
	hp.uniform( 'l1_dropout', 0.1, 0.9 ),
	hp.uniform( 'decay_factor', 1 + 1e-3, 1 + 1e-1 )
)

run_counter = 0
start_clock = clock()

output_file = 'results.csv'
writer = csv.writer( open( output_file, 'wb' ))

best = fmin( run_test, space, algo = tpe.suggest, max_evals = 50 )

print best
print run_test( hyperopt.space_eval( space, best ))

print "Seconds", clock() - start_clock
            'C': hp.uniform('C', -15, 15),
            'gamma': hp.uniform('gamma', -15, 15)
        }

    # optimize parameters
    trial_nr = 0
    trials = Trials()
    all_results = []
    best = fmin(create_and_evaluate_model,
                space,
                algo=tpe.suggest,
                max_evals=n_iter,
                trials=trials)

    # extract the best parameters
    best_params = hyperopt.space_eval(space, best)

    # write to file
    outfile = os.path.join(
        params_dir, "optimal_params_%s_%s_%s.pickle" %
        (cls_method, dataset_name, method_name))
    with open(outfile, "wb") as fout:
        pickle.dump(best_params, fout)

    dt_results = pd.DataFrame(all_results,
                              columns=[
                                  "iter", "param", "value", "nr_events", "auc",
                                  "rmspd", "score"
                              ])
    dt_results["dataset"] = dataset_name
    dt_results["cls"] = cls_method
Beispiel #53
        'gamma':hp.choice('gamma',[0.001, 0.01, 0.1, 1])
        }

# Trials
trials = Trials()

# Set optimization algorithm
algo = partial(tpe.suggest, n_startup_jobs=10, gamma=0.25)

best = fmin(fn=objective_func,
        space=space,
        algo=algo,
        max_evals=100,
        trials=trials)

best_parameters = space_eval(space, best)
print("Best parameters:", str(best_parameters))

# Print time
tdiff = trials.trials[-1]['book_time'] - trials.trials[0]['book_time']
print("Time:" + str(tdiff.total_seconds() / 60))

# Set params
clf = SVC(C=best_parameters['C'], kernel=best_parameters['kernel'], gamma=best_parameters['gamma'], degree=best_parameters['degree'])

# Fit
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Predict
score = classification_report(y_pred, y_test)
Beispiel #54
def hyperopt_lightgbm(X_train: pd.DataFrame, y_train: pd.Series,
                      X_val: pd.DataFrame, y_val: pd.Series, params: Dict,
                      config: Config):

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01),
                                       np.log(0.2)),
        #"max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
        "num_leaves": hp.choice("num_leaves", np.linspace(16, 64, 4,
                                                          dtype=int)),
        #"feature_fraction": hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
        #"bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
        #"bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 1, dtype=int)),
        # "reg_alpha": hp.uniform("reg_alpha", 0, 2),
        # "reg_lambda": hp.uniform("reg_lambda", 0, 2),
        # "min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
        # "scale_pos_weight": hp.uniform('x', 0, 5),
    }

    def objective(hyperparams):
        model = lgb.train({
            **params,
            **hyperparams
        },
                          train_data,
                          500,
                          valid_data,
                          early_stopping_rounds=100,
                          verbose_eval=0)

        score = model.best_score["valid_0"][params["metric"]]

        feature_importance_df = pd.DataFrame()
        feature_importance_df["features"] = X_train.columns
        feature_importance_df["importance_gain"] = model.feature_importance(
            importance_type='gain')
        record_zero_importance = feature_importance_df[
            feature_importance_df["importance_gain"] == 0.0]
        to_drop = list(record_zero_importance['features'])

        # in classification, less is better
        return {
            'loss': -score,
            'status': STATUS_OK,
            "drop_feature": to_drop,
            "best_iter": model.best_iteration
        }

    trials = Trials()
    best = hyperopt.fmin(fn=objective,
                         space=space,
                         trials=trials,
                         algo=tpe.suggest,
                         max_evals=10,
                         verbose=1,
                         rstate=np.random.RandomState(1))

    hyperparams = space_eval(space, best)
    log(f"hyperopt auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}"
        )
    drop_feature = set(X_train.columns.tolist())
    for result in trials.results:
        drop_feature = drop_feature & set(result['drop_feature'])
    return hyperparams, drop_feature, trials.best_trial['result']['best_iter']
        h_score += tmp.sum()*1.0/(cnt*(cnt-1))
    h_score = h_score/numtopics
    '''

    spacetosearch = {
        'numsentilabel': hp.choice('numsentilabel', [10, 20, 50]),
        'numtopics': hp.choice('numtopics', [10, 20, 50]),
    }

    trials = Trials()
    best = fmin(f1,
                spacetosearch,
                algo=tpe.suggest,
                max_evals=9,
                trials=trials)
    best_params1 = space_eval(spacetosearch, best)
    print('best {} with params {}'.format(best, best_params1))
    numsentilabel, numtopics = best_params1['numsentilabel'], best_params1[
        'numtopics']
    run_experiment(numsentilabel, numtopics, alpha, beta, gamma, maxiter,
                   numwordspertopic)

    spacetosearch = {
        'numwordspertopic': hp.choice('numwordspertopic', [5, 10, 20, 25, 50])
    }

    trials = Trials()
    best = fmin(fn=f2,
                space=spacetosearch,
                algo=tpe.suggest,
                trials=trials,
Beispiel #56
                validation_step(post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val)
                print("\n")
                min_mse_val = np.inf
                for i in range(FLAGS.epochs):
                    batches = get_batch(train_data, FLAGS.batch_size)
                    for batch in batches:
                        post_text_batch, truth_class_batch, post_text_len_batch, truth_mean_batch, target_description_batch, target_description_len_batch, image_feature_batch = zip(*batch)
                        train_step(post_text_batch, truth_class_batch, post_text_len_batch, truth_mean_batch, target_description_batch, target_description_len_batch, image_feature_batch)
                    print("\nValidation: ")
                    mse_val = validation_step(post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val)
                    print("\n")
                    if mse_val < min_mse_val:
                        min_mse_val = mse_val
                        # saver.save(sess, checkpoint_prefix)
        round += 1
        val_scores.append(min_mse_val)
    return np.mean(val_scores)


if __name__ == "__main__":
    space = {
        "batch_size": hyperopt.hp.choice("batch_size", [16, 32, 64, 128]),
        "dropout_rate_hidden": hyperopt.hp.choice("dropout_rate_hidden", [0.3, 0.5, 0.7]),
        "learning_rate": hyperopt.hp.choice("learning_rate", [0.001, 0.005, 0.01, 0.05]),
        "gradient_clipping_value": hyperopt.hp.choice("gradient_clipping_value", [0.5, 1, 2, 5, 10])
    }
    best_model = hyperopt.fmin(main, space, algo=hyperopt.tpe.suggest, max_evals=100)
    print(best_model)
    print(hyperopt.space_eval(space, best_model))

Beispiel #57
0
def main():

    usage = "%prog <DRLD|MIP|MOLD|Primary|General|Terrorist|PK-Brown|PK-Roberts|PK-Pelosi|PK-Cheney>"
    parser = OptionParser(usage=usage)
    parser.add_option("-m", dest="model", default="LR", help="Model: (LR|SVM|MNB|SVMNB); default=%default")
    parser.add_option("-t", dest="test_fold", default=0, help="Test fold; default=%default")
    parser.add_option("-o", dest="output_dirname", default="bayes_opt", help="Output directory name")
    parser.add_option(
        "--reuse", dest="reuse", action="store_true", default=False, help="Use reusable holdout; default=%default"
    )
    parser.add_option(
        "--alpha",
        dest="alpha",
        action="store_true",
        default=False,
        help="Include alpha in search space (instead of grid search); default=%default",
    )
    parser.add_option(
        "--n_dev_folds",
        dest="n_dev_folds",
        default=5,
        help="Number of dev folds to use when tuning/evaluating; default=%default",
    )

    # parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space, run, group, test_fold, n_dev_folds

    run = args[0]
    reuse = options.reuse
    search_alpha = options.alpha
    # n_codes = int(options.n_codes)
    output_dirname = options.output_dirname
    model = options.model
    test_fold = int(options.test_fold)
    n_dev_folds = int(options.n_dev_folds)

    # allow the user to specify a particular choice of model
    if model == "LR":
        space["model"] = {
            "model": "LR",
            #'regularization': hp.choice('regularization', ['l1', 'l2'])
            "regularization": "l1",
        }
    elif model == "SVM":
        space["model"] = {
            "model": "SVM",
            "kernel": hp.choice(
                "ktype",
                [{"ktype": "linear"}, {"ktype": "poly", "degree": hp.choice("degree", [2, 3, 4])}, {"ktype": "rbf"}],
            ),
        }
    elif model == "MNB":
        space["model"] = {"model": "MNB"}
    elif model == "SVMNB":
        space["model"] = {"model": "SVMNB", "beta": hp.uniform("beta", 0, 1)}
    else:
        sys.exit("Choice of model not supported!")

    if run == "DRLD":
        add_drld()
        group = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]
        n_codes = 33
    elif run == "MIP":
        add_MIP()
        group = ["MIP-Personal-1", "MIP-Personal-2", "MIP-Political-1", "MIP-Political-2"]
        n_codes = 74
    elif run == "MOLD":
        add_MOLD()
        group = ["McCain-Likes", "McCain-Dislikes", "Obama-Likes", "Obama-Dislikes"]
        n_codes = 34
    elif run == "Primary":
        add_obama()
        add_clinton()
        group = ["Obama-Primary", "Clinton-Primary"]
        n_codes = 42
    elif run == "General":
        add_obama()
        add_mccain()
        group = ["Obama-General", "McCain-General"]
        n_codes = 41
    elif run == "Terrorists":
        group = [run]
        n_codes = 28
    elif run == "PK-Brown":
        group = [run]
        n_codes = 14
    elif run == "PK-Cheney":
        group = [run]
        n_codes = 12
    elif run == "PK-Pelosi":
        group = [run]
        n_codes = 15
    elif run == "PK-Roberts":
        group = [run]
        n_codes = 14
    else:
        sys.exit("Dataset not recognized")

    output_dirname += "_" + model

    if search_alpha:
        space["alphas"] = []
        for i in range(n_codes):
            space["alphas"].append(hp.loguniform("alpha" + str(i), -1.15, 9.2))
        output_dirname += "_alphas"

    if reuse:
        output_dirname += "_reuse"
    else:
        output_dirname += "_noreuse"
    output_dirname += "_" + run

    if n_dev_folds != 5:
        output_dirname += "_" + str(n_dev_folds)

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), "log")

    with codecs.open(output_filename, "w") as output_file:
        output_file.write(output_dirname + "\n")
        output_file.write("reuse = " + str(reuse) + "\n")
        output_file.write("search alphas = " + str(search_alpha) + "\n")

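    # Trials records each evaluation so results can be inspected after the search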
    trials = Trials()
    best = fmin(call_experiment, space=space, algo=tpe.suggest, max_evals=40, trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
Beispiel #58
0
    def _train_on_dataset(
        self,
        dataset,
        train_size=0.67,
        batch_size=64,
        num_epochs=100,
        num_cpus=1,
        num_gpus=0,
        max_evals=None,
        progress_cb=None,
        abnormal=None,
    ):
        if max_evals is None:
            max_evals = self.settings.get('max_evals',
                                          21)  # latent_dim*intermediate_dim

        self.current_eval = 0

        self.stat_dataset(dataset)
        dataset = self.scale_dataset(dataset)

        def cross_val_model(params):
            keras_model = None
            # Destroys the current TF graph and creates a new one.
            # Useful to avoid clutter from old models / layers.
            K.clear_session()
            self._set_xpu_config(num_cpus, num_gpus)

            self.span = W = params.span
            (X_miss, X_train), (X_miss_val, X_test) = self.train_test_split(
                dataset,
                train_size=train_size,
                abnormal=abnormal,
            )
            if len(X_train) == 0:
                raise errors.NoData("insufficient training data")
            if len(X_test) == 0:
                raise errors.NoData("insufficient validation data")

            # expected input data shape: (batch_size, timesteps,)
            # network parameters
            input_shape = (W, )
            intermediate_dim = params.intermediate_dim
            latent_dim = params.latent_dim

            # VAE model = encoder + decoder
            # build encoder model
            main_input = Input(shape=input_shape)
            aux_input = Input(
                shape=input_shape)  # bool vector to flag missing data points
            aux_output = Lambda(lambda x: x)(aux_input)
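            # identity Lambda: re-emits the missing-data mask as a model output,
            # presumably so add_loss below can use it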
            x = Dense(intermediate_dim,
                      kernel_regularizer=regularizers.l2(0.01),
                      activation='relu')(main_input)
            z_mean = Dense(latent_dim, name='z_mean')(x)
            z_log_var = Dense(latent_dim, name='z_log_var')(x)

            # use reparameterization trick to push the sampling out as input
            # note that "output_shape" isn't necessary with the TensorFlow backend
            z = Lambda(sampling, output_shape=(latent_dim, ),
                       name='z')([z_mean, z_log_var])

            # build decoder model
            x = Dense(intermediate_dim,
                      kernel_regularizer=regularizers.l2(0.01),
                      activation='relu',
                      name='dense_1')(z)
            main_output = Dense(W, activation='linear', name='dense_2')(x)

            # instantiate Donut model
            keras_model = _Model([main_input, aux_input],
                                 [main_output, aux_output],
                                 name='donut')
            add_loss(keras_model, W)
            optimizer_cls = None
            if params.optimizer == 'adam':
                optimizer_cls = tf.keras.optimizers.Adam()

            keras_model.compile(optimizer=optimizer_cls)

            _stop = EarlyStopping(
                monitor='val_loss',
                patience=5,
                verbose=_verbose,
                mode='auto',
            )
            keras_model.fit_generator(
                generator(X_train, X_miss, batch_size, keras_model),
                epochs=num_epochs,
                steps_per_epoch=max(1, len(X_train) // batch_size),  # integer division; Keras expects an int here
                verbose=_verbose,
                validation_data=([X_test, X_miss_val], None),
                callbacks=[_stop],
                workers=0,  # https://github.com/keras-team/keras/issues/5511
            )

            # How well did it do?
            score = keras_model.evaluate(
                [X_test, X_miss_val],
                batch_size=batch_size,
                verbose=_verbose,
            )

            self.current_eval += 1
            if progress_cb is not None:
                progress_cb(self.current_eval, max_evals)

            return score, keras_model

        hyperparameters = HyperParameters()
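        # HyperParameters (defined elsewhere) presumably exposes the assigned
        # dict entries as attributes, e.g. params.span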

        # Parameter search space
        def objective(args):
            hyperparameters.assign(args)

            try:
                score, _ = cross_val_model(hyperparameters)
                return {'loss': score, 'status': STATUS_OK}
            except Exception as exn:
                logging.warning("iteration failed: %s", exn)
                return {'loss': None, 'status': STATUS_FAIL}

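        # the intermediate_dim candidates follow the Fibonacci sequence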
        space = hp.choice('case', [{
            'span': self.get_hp_span('span'),
            'latent_dim': hp.choice('latent_dim', [3, 5, 8]),
            'intermediate_dim': hp.choice('i1', [21, 34, 55, 89, 144, 233, 377]),
            'optimizer': hp.choice('optimizer', ['adam']),
        }])

        # The Trials object will store details of each iteration
        trials = Trials()

        # Run the hyperparameter search using the tpe algorithm
        try:
            best = fmin(
                objective,
                space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
            )
        except ValueError:
            raise errors.NoData(
                "training failed, try to increase the time range")

        # Get the values of the optimal parameters
        best_params = space_eval(space, best)
        score, self._keras_model = cross_val_model(
            HyperParameters(best_params))
        self.span = best_params['span']
        return (best_params, score)
Beispiel #59
0
    def model(self):
        #cname = sys._getframe().f_code.co_name
        if self.preselect_features_:
            self.greedy_select_features()

        train = self.train_.values
        test = self.test_.values
        y = self.y_

        self.baseline_score_, _ = self.ccv(linear_model.BayesianRidge(), train, y, metrics.make_scorer(metrics.log_loss))
        self.baseline_stacker_ = linear_model.BayesianRidge()

        for fit_intercept in [False, True]:
            for normalize in [False, True]:
                lr = linear_model.LinearRegression(fit_intercept=fit_intercept,
                                                   normalize=normalize)
                score, _ = self.ccv(lr, train, y, metrics.make_scorer(metrics.log_loss))
                if score < self.baseline_score_:
                    self.baseline_score_ = score
                    self.baseline_stacker_ = lr
        print('baseline:', self.baseline_score_, self.baseline_stacker_)

        np.random.seed(1)
        from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
        space_stack = hp.choice('stacking by',
            [
                dict( type = 'Ridge',
                     random_state = 1, #hp.choice('random_state', range(1, 100000)),
                     alpha = hp.loguniform('alpha', -7, 5),
                     fit_intercept = hp.choice('fit_intercept1', [True, False]),
                     normalize = hp.choice('normalize1', [True, False])
                     ),
            ])

        def get_lr(params):
            t = params['type']
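            # drop the selector key so the remaining entries map directly to estimator kwargs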
            del params['type']

            if t == 'LinearRegression':
                lr = linear_model.LinearRegression(**params)
            elif t == 'Ridge':
                lr = linear_model.Ridge(**params)
            else:
                raise Exception()

            return lr

        def step(params):
            print(params, end=' ')
            score, _ = self.ccv(get_lr(params), train, y, metrics.make_scorer(metrics.log_loss))
            print(score, self.now())
            return dict(loss=score, status=STATUS_OK)

        trs = self.load('trials')
        if trs is None:
            tr = Trials()
        else:
            tr, _ = trs
        if len(tr.trials) > 0:
            print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_stack, tr.argmin))
        mt = max(self.hyperopt_rounds_, len(tr.trials) + 1)
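        # run one evaluation at a time, checkpointing the trials after each
        # so an interrupted search can be resumed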
        while len(tr.trials) < mt:
            print(len(tr.trials), end=' ')
            best = fmin(step, space_stack,
                        algo=partial(tpe.suggest, n_startup_jobs=1),
                        max_evals=len(tr.trials) + 1, trials=tr)
            self.save('trials', (tr, space_stack))
        params = space_eval(space_stack, best)

        print('best params:', params)
        lr = get_lr(params)
        cv = model_selection.cross_val_score(lr,
                                             train, y,
                                             cv=self.n_fold_,
                                             scoring=metrics.make_scorer(metrics.log_loss))
        if np.mean(cv) > self.baseline_score_:
            lr = self.baseline_stacker_
            cv = model_selection.cross_val_score(lr, train, y, cv=self.n_fold_,
                                                 scoring=metrics.make_scorer(metrics.log_loss))

        lr.fit(train, y)

        v, z = self.v_, self.z_
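        # clip predictions away from 0 and 1 so log_loss stays finite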
        z['p'] = np.clip(lr.predict(test), 1e-5, 1-1e-5)
        z['y'] = z['p']
        v['p'] = model_selection.cross_val_predict(lr, train, y, cv=10)
        print('cv:', np.mean(cv), np.std(cv))
        return cv, None
Beispiel #60
0
def test_opt():
    # NOTE: this example begins mid-function in the source; the enclosing
    # function name and the objective signature are assumed reconstructions
    def objective(args):
        case, val = args
        if case == "case 1":
            return val
        else:
            return val**2

    from hyperopt import hp

    space = hp.choice(
        "a",
        [
            ("case 1", 1 + hp.lognormal("c1", 0, 1)),
            ("case 2", hp.uniform("c2", -10, 10)),
        ],
    )

    from hyperopt import fmin, tpe, space_eval

    best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

    print(best)
    print(space_eval(space, best))
    print(objective(space_eval(space, best)))


if __name__ == "__main__":
    # info = []
    # test_method(sean_solution, info)
    # test_opt()
    test_file_writer()