def bo_tpe_knn(X, y):
    starttime = datetime.datetime.now()

    def objective(params):
        params = {'n_neighbors': abs(int(params['n_neighbors']))}
        clf = KNeighborsRegressor(**params)
        score = -np.mean(
            cross_val_score(
                clf, X, y, cv=3, n_jobs=-1, scoring="neg_mean_squared_error"))
        return {'loss': score, 'status': STATUS_OK}

    space = {
        'n_neighbors': hp.quniform('n_neighbors', 1, 20, 1),
    }

    trials_knn = Trials()
    best_knn = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=10,
                    trials=trials_knn)
    print("KNN MSE score:%.4f" % min(trials_knn.losses()))
    endtime = datetime.datetime.now()
    process_time_knn = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_knn))
    print("最佳超参数值集合:", best_knn)
    save_model_object(best_knn, 'BO-TPE', 'KNN', 'KNN')
    return min(trials_knn.losses()), process_time_knn, best_knn
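
# Illustrative driver (added; not part of the original source): how a helper like
# bo_tpe_knn might be exercised on a small regression problem. Assumes scikit-learn
# is installed and that the imports used by the snippet above are in scope.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
knn_mse, knn_time, knn_best = bo_tpe_knn(X_demo, y_demo)
print("demo MSE:", knn_mse, "best n_neighbors:", int(knn_best['n_neighbors']))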
    def __call__(self, pipeline_data, context):
        model, hparam_space = context
        self._log.info(hparam_space)
        if not hparam_space:
            self._log.warn((f"Skipping hyperopt step for model {model}. No "
                            "parameter templats found"))
            return HyperparameterSearchResult(model, 0, None)

        trials = Trials()
        self._log.info(f"Running hyperparameter optimization for {model}")

        score = partial(self._score_step_fn, pipeline_data, context)

        fmin(score,
             space=hparam_space,
             algo=tpe.suggest,
             trials=trials,
             max_evals=self._max_evals)

        if self._reverse_score:
            self._log.info(
                "Reversing best score back to its original form as reverse_score=True"
            )
            best_score = 1 - sorted(trials.losses())[0]
        else:
            best_score = sorted(trials.losses())[0]

        best = trials.best_trial['result']['model']
        result = HyperparameterSearchResult(best, best_score, trials)
        return result
def main():

    usage = "%prog"
    parser = OptionParser(usage=usage)
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_chars',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space
    reuse = options.reuse
    output_dirname = options.output_dirname

    if reuse:
        output_dirname += '_reuse'

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename_wo_ext(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=100,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
# Example #4
def run_all_dl(
    csvfile=saving_fp,
    space=[
        hp.quniform('h1', 100, 550, 1),
        hp.quniform('h2', 100, 550, 1),
        hp.quniform('h3', 100, 550, 1),
        #hp.choice('activation', ["RectifierWithDropout", "TanhWithDropout"]),
        hp.uniform('hdr1', 0.001, 0.3),
        hp.uniform('hdr2', 0.001, 0.3),
        hp.uniform('hdr3', 0.001, 0.3),
        hp.uniform('rho', 0.9, 0.999),
        hp.uniform('epsilon', 1e-10, 1e-4)
    ]):
    # maxout works well with dropout (Goodfellow et al 2013), and rectifier has worked well with image recognition (LeCun et al 1998)
    start_save(csvfile=csvfile)
    trials = Trials()
    print "Deep learning..."
    best = fmin(objective,
                space=space,
                algo=tpe.suggest,
                max_evals=evals,
                trials=trials)
    print(best)
    print(trials.losses())
    with open('output/dlbest.pkl', 'wb') as output:
        pickle.dump(best, output, -1)
    with open('output/dltrials.pkl', 'wb') as output:
        pickle.dump(trials, output, -1)
# Example #5
def main():

    usage = "%prog text.json labels.csv feature_dir output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_iter', default=4,
                      help='Maximum iterations of Bayesian optimization; default=%default')

    (options, args) = parser.parse_args()
    max_iter = int(options.max_iter)

    global data_filename, label_filename, feature_dir, output_dir, log_filename

    data_filename = args[0]
    label_filename = args[1]
    feature_dir = args[2]
    output_dir = args[3]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    log_filename = os.path.join(output_dir, 'log.txt')

    with open(log_filename, 'w') as logfile:
        logfile.write(','.join([data_filename, label_filename, feature_dir, output_dir]))

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
# Example #6
def run_all_gbm(csvfile=saving_fp,
                space=[
                    hp.quniform('ntrees', 200, 750, 1),
                    hp.quniform('max_depth', 5, 15, 1),
                    hp.uniform('learn_rate', 0.03, 0.35)
                ]):
    # Search space is a stochastic argument-sampling program:
    start_save(csvfile=csvfile)
    trials = Trials()
    best = fmin(objective,
                space=space,
                algo=tpe.suggest,
                max_evals=evals,
                trials=trials)
    print(best)
    # from hyperopt import space_eval
    # print(space_eval(space, best))
    # trials.trials # list of dictionaries representing everything about the search
    # trials.results # list of dictionaries returned by 'objective' during the search
    print(trials.losses())  # list of losses (float for each 'ok' trial)
    # trials.statuses() # list of status strings
    with open('output/gbmbest.pkl', 'wb') as output:
        pickle.dump(best, output, -1)
    with open('output/gbmtrials.pkl', 'wb') as output:
        pickle.dump(trials, output, -1)
def bo_tpe_svr(X, y):
    starttime = datetime.datetime.now()

    def objective(params):
        params = {
            'C': abs(float(params['C'])),
            "kernel": str(params['kernel']),
            'epsilon': abs(float(params['epsilon'])),
        }
        clf = SVR(gamma='scale', **params)
        score = -np.mean(
            cross_val_score(
                clf, X, y, cv=3, n_jobs=-1, scoring="neg_mean_squared_error"))

        return {'loss': score, 'status': STATUS_OK}

    space = {
        'C': hp.normal('C', 0, 50),
        "kernel": hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
        'epsilon': hp.normal('epsilon', 0, 1),
    }

    trials_svr = Trials()
    best_svr = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=20,
                    trials=trials_svr)
    print("SVM MSE score:%.4f" % min(trials_svr.losses()))
    endtime = datetime.datetime.now()
    process_time_svr = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_svr))
    print("最佳超参数值集合:", best_svr)
    save_model_object(best_svr, 'BO-TPE', 'SVR', 'SVR')
    return min(trials_svr.losses()), process_time_svr, best_svr
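
# Added illustration (not in the original source): fmin returns hp.choice parameters as
# indices, so best_svr['kernel'] above is 0, 1, or 2 rather than the kernel name.
# space_eval maps the raw fmin result back onto the search space; the helper below is a
# minimal sketch assuming the same kind of space dict that bo_tpe_svr builds.
def decode_best_params(space, best):
    from hyperopt import space_eval
    return space_eval(space, best)  # e.g. {'C': 31.7, 'epsilon': 0.42, 'kernel': 'rbf'}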
# Example #9
    def work(self):

        bandit = self.bandit
        assert bandit.name is not None
        print("Bandit", bandit.name)
        algo = TreeParzenEstimator(
            bandit,
            gamma=self.gammas.get(bandit.name, TreeParzenEstimator.gamma),
            prior_weight=self.prior_weights.get(
                bandit.name, TreeParzenEstimator.prior_weight),
            n_EI_candidates=self.n_EIs.get(
                bandit.name, TreeParzenEstimator.n_EI_candidates),
        )
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        exp = Experiment(trials, algo)
        exp.catch_bandit_exceptions = False
        exp.run(LEN)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            exp = Experiment(rtrials, Random(bandit))
            exp.run(LEN)
            print("RANDOM MINS", list(sorted(rtrials.losses()))[:6])
            #logx = np.log([s['x'] for s in rtrials.specs])
            #print 'RND MEAN', np.mean(logx)
            #print 'RND STD ', np.std(logx)

        print(algo.n_EI_candidates)
        print(algo.gamma)
        print(algo.prior_weight)

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist([t['x'] for t in self.experiment.trials], bins=20)

        #print trials.losses()
        print("TPE    MINS", list(sorted(trials.losses()))[:6])
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print("Thresh", thresh)
        assert min(trials.losses()) < thresh
# Example #10
    def work(self):

        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(
            tpe.suggest,
            gamma=self.gammas.get(bandit.name, tpe._default_gamma),
            prior_weight=self.prior_weights.get(bandit.name,
                                                tpe._default_prior_weight),
            n_EI_candidates=self.n_EIs.get(bandit.name,
                                           tpe._default_n_EI_candidates),
        )
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        fmin(
            passthrough,
            space=bandit.expr,
            algo=algo,
            trials=trials,
            max_evals=LEN,
            rstate=np.random.default_rng(np.random.PCG64(0)),
            catch_eval_exceptions=False,
        )
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(
                passthrough,
                space=bandit.expr,
                algo=rand.suggest,
                trials=rtrials,
                max_evals=LEN,
            )
            print("RANDOM MINS", list(sorted(rtrials.losses()))[:6])

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(list(range(LEN)), trials.losses())
            plt.title("TPE losses")
            plt.subplot(2, 2, 2)
            plt.scatter(list(range(LEN)), ([s["x"] for s in trials.specs]))
            plt.title("TPE x")
            plt.subplot(2, 2, 3)
            plt.title("RND losses")
            plt.scatter(list(range(LEN)), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title("RND x")
            plt.scatter(list(range(LEN)), ([s["x"] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist([t["x"] for t in self.experiment.trials], bins=20)

        print("TPE    MINS", list(sorted(trials.losses()))[:6])
        thresh = self.thresholds[bandit.name]
        print("Thresh", thresh)
        assert min(trials.losses()) < thresh
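
# Hedged stand-in (added; not in the original source) for the `passthrough` objective
# these test methods reference: the bandit's search space already evaluates to the loss
# (or a full result dict), so the objective simply forwards whatever it is given.
def passthrough(sample):
    return sample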
def cli(name, num_trials, verbose):
    """Optimize the hyperparameters for a TMVA BDT. The argument NAME is used
    as the suffix for the TMVA output filename.

    This must be called from within a workspace which contains a configuration
    module named config and an optional directory named macros which contains
    shared libraries of compiled ROOT macros.

    The hyperparameter optimization uses the hyperopt package.
    Bergstra, J., Yamins, D., Cox, D. D. (2013) Making a Science of Model Search:
    Hyperparameter Optimization in Hundreds of Dimensions for Vision Architectures
    """
    logging_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(format='[%(name)s] %(levelname)s - %(message)s',
                        level=logging_level)
    # Set ROOT to batch mode.
    ROOT.gROOT.SetBatch(True)
    # Load macros to the global ROOT instance.
    for macro in os.listdir('macros'):
        _, ext = os.path.splitext(macro)
        if ext == '.so':
            load_status = ROOT.gSystem.Load('macros/{}'.format(macro))
            if load_status < 0:
                raise RuntimeError('Failed to load macro {}'.format(macro))
    # Dynamically load the configuration module.
    config = load_config()
    LOGGER.info('Performing hyperparameter optimization search...')
    with contextlib2.ExitStack() as stack:
        signal_files = [
            stack.enter_context(root_open('sample/{}.root'.format(
                sample.name))) for sample in config.SIGNAL
        ]
        background_files = [
            stack.enter_context(root_open('sample/{}.root'.format(
                sample.name))) for sample in config.BACKGROUND
        ]
        trials = Trials()
        objective = functools.partial(train, name, signal_files,
                                      background_files, config.FEATURES,
                                      config.EVENT_WEIGHT)
        best = fmin(objective,
                    HYPERPARAM_SPACE,
                    algo=tpe.suggest,
                    max_evals=num_trials,
                    trials=trials)
    LOGGER.debug('Trials: %s', trials.trials)
    LOGGER.debug('Trial Results: %s', trials.results)
    LOGGER.debug('Trial Losses: %s', trials.losses())
    LOGGER.debug('Best Trial Hyperparameters: %s', best)
    LOGGER.info('Best Trial Loss: %s', min(trials.losses()))
    for hyperparam, value in best.items():
        if hyperparam in HYPERPARAM_CHOICE_MAP:
            best[hyperparam] = HYPERPARAM_CHOICE_MAP[hyperparam][value]
    del best['TMVA_BDT']
    LOGGER.info(
        'Best Trial TMVA BDT Options: "%s"',
        ':'.join('{0}={1!s}'.format(*param) for param in best.items()))
# Example #12
    def work(self):
        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(
            tree.suggest,
            # XXX (begin)
            n_trees=10,
            logprior_strength=1.0,
            # XXX (end)
        )
        LEN = self.LEN.get(bandit.name, 75)

        trials = Trials()
        fmin(fn=passthrough,
            space=self.bandit.expr,
            trials=trials,
            algo=algo,
            max_evals=LEN)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(fn=passthrough,
                space=self.bandit.expr,
                trials=rtrials,
                algo=rand.suggest,
                max_evals=LEN)
            print("RANDOM BEST 6:", list(sorted(rtrials.losses()))[:6])

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)

        #print trials.losses()
        print("OPT BEST 6:", list(sorted(trials.losses()))[:6])
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print("Thresh", thresh)
        assert min(trials.losses()) < thresh
# Example #13
    def work(self):
        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(anneal.suggest)
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        fmin(
            fn=passthrough,
            space=self.bandit.expr,
            trials=trials,
            algo=algo,
            max_evals=LEN,
        )
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(
                fn=passthrough,
                space=self.bandit.expr,
                trials=rtrials,
                algo=rand.suggest,
                max_evals=LEN,
            )
            print("RANDOM BEST 6:", list(sorted(rtrials.losses()))[:6])

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(list(range(LEN)), trials.losses())
            plt.title("TPE losses")
            plt.subplot(2, 2, 2)
            plt.scatter(list(range(LEN)), ([s["x"] for s in trials.specs]))
            plt.title("TPE x")
            plt.subplot(2, 2, 3)
            plt.title("RND losses")
            plt.scatter(list(range(LEN)), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title("RND x")
            plt.scatter(list(range(LEN)), ([s["x"] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist([t["x"] for t in self.experiment.trials], bins=20)

        # print trials.losses()
        print("ANNEAL BEST 6:", list(sorted(trials.losses()))[:6])
        # logx = np.log([s['x'] for s in trials.specs])
        # print 'TPE MEAN', np.mean(logx)
        # print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print("Thresh", thresh)
        assert min(trials.losses()) < thresh
def main():
    set_globals()
    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)
    
    print(space_eval(space, best))
    print("losses:", [-l for l in trials.losses()])
    print('the best loss: ', max([-l for l in trials.losses()]))
    print("number of trials: " + str(len(trials.trials)))
def bo_tpe_ANN(X, y):
    starttime = datetime.datetime.now()

    def objective(params):
        params = {
            "activation": str(params['activation']),
            "loss": str(params['loss']),
            'batch_size': abs(int(params['batch_size'])),
            'neurons': abs(int(params['neurons'])),
            'epochs': abs(int(params['epochs'])),
            'learning_rate': abs(float(params['learning_rate']))
        }
        clf = KerasRegressor(build_fn=ANN, **params, verbose=verbose)
        score = -np.mean(
            cross_val_score(clf, X, y, cv=3, scoring="neg_mean_squared_error"))

        return {'loss': score, 'status': STATUS_OK}

    space_activation = ['relu', 'tanh']
    space_loss = ['mse', 'mae']
    space = {
        "activation": hp.choice('activation', space_activation),
        "loss": hp.choice('loss', space_loss),
        'batch_size': hp.quniform('batch_size', 32, 128, 32),
        'neurons': hp.quniform('neurons', 256, 1024, 256),
        'epochs': hp.quniform('epochs', 30, 60, 10),
        'learning_rate': hp.uniform('learning_rate', 1e-5, 1e-2)
    }

    trials_ann = Trials()
    best_ann = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=10,
                    trials=trials_ann)
    print("ANN MSE score:%.4f" % min(trials_ann.losses()))
    endtime = datetime.datetime.now()
    process_time_ann = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_ann))
    print("最佳超参数值集合:", best_ann)
    best_params_ann = {
        'activation': space_activation[best_ann['activation']],
        'loss': space_loss[best_ann['loss']],
        'batch_size': int(best_ann['batch_size']),
        'neurons': int(best_ann['neurons']),
        'epochs': int(best_ann['epochs']),
        'learning_rate': float(best_ann['learning_rate'])
    }
    model_bo_tpe_ann = ANN(**best_params_ann)
    save_model_object(model_bo_tpe_ann, 'BO-TPE', 'ANN', 'ANN')
    return min(trials_ann.losses()), process_time_ann, best_ann
def notest_opt_qn_normal(f=hp_normal):
    bandit = Bandit(
        {'loss': scope.sum([f('v%i' % ii, 0, 1) for ii in range(25)])**2},
        loss_target=0)
    algo = TreeParzenEstimator(bandit,
                               prior_weight=.5,
                               n_startup_jobs=0,
                               n_EI_candidates=1,
                               gamma=0.15)
    trials = Trials()
    experiment = Experiment(trials, algo, async=False)  # note: 'async' is a reserved word in Python 3.7+
    experiment.max_queue_len = 1
    experiment.run(40)
    print('sorted losses:', list(sorted(trials.losses())))

    idxs, vals = miscs_to_idxs_vals(trials.miscs)

    if 1:
        import hyperopt.plotting
        hyperopt.plotting.main_plot_vars(trials, bandit, do_show=1)
    else:
        import matplotlib.pyplot as plt
        begin = [v[:10] for k, v in vals.items()]
        end = [v[-10:] for k, v in vals.items()]
        plt.subplot(2, 1, 1)
        plt.title('before')
        plt.hist(np.asarray(begin).flatten())
        plt.subplot(2, 1, 2)
        plt.title('after')
        plt.hist(np.asarray(end).flatten())
        plt.show()
# Example #17
def optimize():
    space = {
        'n_estimators': 10000,
        'eta': 0.01,

        # Model complexity
        'max_depth': hp.quniform('max_depth', 1, 15, 1),
        'min_child_weight': hp.quniform('min_child_weight', 1, 100, 1),

        # Robust to noise
        'subsample': hp.quniform('subsample', 0.1, 1.0, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.05),
        'colsample_bylevel': hp.quniform('colsample_bylevel', 0.1, 1.0, 0.05),

        # 'gamma' : hp.uniform('gamma', 0.0, 1000.0),
        # 'alpha' : hp.uniform('alpha', 0.0, 1000.0),
        # 'lambda' : hp.uniform('lambda', 0.001, 1000.0),
        'eval_metric': 'mae',
        'objective': 'reg:linear',
        'silent': 1,
        'seed': 12345
    }
    trials = Trials()
    best_parameters = fmin(fn=score,
                           space=space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=500)
    print("Best parameters:")
    print(best_parameters)
    print("Best result: {0}\n".format(min(trials.losses())))
# Example #18
        def work(self):
            bandit = self.bandit
            random_algo = Random(bandit)
            # build an experiment of 10 trials
            trials = Trials()
            exp = Experiment(trials, random_algo)
            #print random_algo.s_specs_idxs_vals
            exp.run(10)
            ids = trials.tids
            assert len(ids) == 10
            tpe_algo = TreeParzenEstimator(bandit)
            #print pyll.as_apply(tpe_algo.post_idxs)
            #print pyll.as_apply(tpe_algo.post_vals)
            argmemo = {}

            print(trials.miscs)
            idxs, vals = miscs_to_idxs_vals(trials.miscs)
            argmemo[tpe_algo.observed['idxs']] = idxs
            argmemo[tpe_algo.observed['vals']] = vals
            argmemo[tpe_algo.observed_loss['idxs']] = trials.tids
            argmemo[tpe_algo.observed_loss['vals']] = trials.losses()
            stuff = pyll.rec_eval(
                [tpe_algo.post_below['idxs'], tpe_algo.post_below['vals']],
                memo=argmemo)
            print(stuff)
# Example #19
    def work(self):
        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(anneal.suggest)
        iters_thresholds = self.iters_thresholds.get(bandit.name, 50)

        trials = Trials()
        fmin(
            fn=passthrough,
            space=self.bandit.expr,
            trials=trials,
            algo=algo,
            max_evals=iters_thresholds,
            rstate=np.random.RandomState(8),
        )
        assert len(trials) == iters_thresholds

        rtrials = Trials()
        fmin(
            fn=passthrough,
            space=self.bandit.expr,
            trials=rtrials,
            algo=rand.suggest,
            max_evals=iters_thresholds,
            rstate=np.random.RandomState(8),
        )

        thresh = self.thresholds[bandit.name]
        assert min(trials.losses()) < thresh
# Example #20
def optimize(obj_function, inputs, key_file, space, max_eval):

    trials = Trials()
    f = partial(obj_function, inputs, key_file)
    best = fmin(f, space=space, algo=tpe.suggest, max_evals=max_eval,
                trials=trials)
    LOGGER.info("{}\t{}".format(best, 1 - min(trials.losses())))
# Example #21
 def run(self):
     trials = Trials()
     best = fmin(self._obj, self.model_param_space._build_space(),
                 tpe.suggest, self.max_evals, trials)
     best_params = space_eval(self.model_param_space._build_space(), best)
     best_params = self.model_param_space._convert_into_param(best_params)
     trial_loss = np.asarray(trials.losses(), dtype=float)
     best_ind = np.argmin(trial_loss)
     mrr = -trial_loss[best_ind]
     raw_mrr = trials.trial_attachments(trials.trials[best_ind])["raw_mrr"]
     raw_hits_at1 = trials.trial_attachments(
         trials.trials[best_ind])["raw_hits_at1"]
     raw_hits_at3 = trials.trial_attachments(
         trials.trials[best_ind])["raw_hits_at3"]
     raw_hits_at10 = trials.trial_attachments(
         trials.trials[best_ind])["raw_hits_at10"]
     hits_at1 = trials.trial_attachments(
         trials.trials[best_ind])["hits_at1"]
     hits_at3 = trials.trial_attachments(
         trials.trials[best_ind])["hits_at3"]
     hits_at10 = trials.trial_attachments(
         trials.trials[best_ind])["hits_at10"]
     self.logger.info("-" * 50)
     self.logger.info("Best CV Results:")
     self.logger.info("Raw MRR: %.6f" % raw_mrr)
     self.logger.info("Filtered MRR: %.6f" % mrr)
     self.logger.info("Raw: Hits@1 %.3f Hits@3 %.3f Hits@10 %.3f" %
                      (raw_hits_at1, raw_hits_at3, raw_hits_at10))
     self.logger.info("Filtered: Hits@1 %.3f Hits@3 %.3f Hits@10 %.3f" %
                      (hits_at1, hits_at3, hits_at10))
     self.logger.info("Best Param:")
     self.task._print_param_dict(best_params)
     self.logger.info("-" * 50)
# Example #22
def optimize_model_pytorch(device, args, train_GWAS, train_y, test_GWAS, test_y, out_folder ="", startupJobs = 40, maxevals = 200, noOut = False):
    global numTrials_pytorch
    numTrials_pytorch= 0

    trials = Trials()
    trial_wrapper = partial(trial_pytorch,device = device, args = args , train_GWAS = train_GWAS, train_y = train_y , test_GWAS = test_GWAS , test_y = test_y)

    best_pars = fmin(trial_wrapper, parameter_space_pytorch(), algo=partial(tpe.suggest, n_startup_jobs=(startupJobs) ), max_evals=maxevals, trials=trials)

    # Print the selected 'best' hyperparameters.
    if not noOut:
        print('\nBest hyperparameter settings: ', space_eval(parameter_space_pytorch(), best_pars), '\n')

    # loops through the 1st entry in the dict that holds all the lookup keys
    regression = True

    for p in trials.trials[0]['misc']['idxs']: plot_optimization_pytorch(trials, p, regression, out_folder = out_folder) 

    best_pars = space_eval(parameter_space_pytorch(), best_pars)  # turn the indices back into the actual values in the valid parameter space

    # override the epochs with the early-stopping epoch of the best trial
    lowestLossIndex = np.argmin(trials.losses())
    best_pars['earlyStopEpochs'] = trials.trial_attachments(trials.trials[lowestLossIndex])['highestAcc_epoch']
    best_pars['earlyStopEpochs'] += 1  # as epochs are 0-based otherwise
    best_pars['epochs'] = best_pars['earlyStopEpochs']
    if best_pars['epochs'] <= 0:
        best_pars['epochs'] = 1  # we don't want a network without any training, as that would break deep dreaming
    return best_pars
# Example #24
 def run(self):
     start = time.time()
     trials = Trials()
     best = fmin(self._obj, self.model_param_space._build_space(), tpe.suggest, self.max_evals, trials)
     best_params = space_eval(self.model_param_space._build_space(), best)
     best_params = self.model_param_space._convert_int_param(best_params)
     trial_rmses = np.asarray(trials.losses(), dtype=float)
     best_ind = np.argmin(trial_rmses)
     best_rmse_mean = trial_rmses[best_ind]
     best_rmse_std = trials.trial_attachments(trials.trials[best_ind])["std"]
     self.logger.info("-"*50)
     self.logger.info("Best RMSE")
     self.logger.info("      Mean: %.6f"%best_rmse_mean)
     self.logger.info("      std: %.6f"%best_rmse_std)
     self.logger.info("Best param")
     self.task._print_param_dict(best_params)
     end = time.time()
     _sec = end - start
     _min = int(_sec/60.)
     self.logger.info("Time")
     if _min > 0:
         self.logger.info("      %d mins"%_min)
     else:
         self.logger.info("      %d secs"%_sec)
     self.logger.info("-"*50)
# Example #25
    def optimize_and_return_best(self, X):
        """ Main method to start the optimization process on a dataset. """
        def objective(hparams):
            kfold = KFoldCV(self.ner_model, self.cv, self.eval_split,
                            self.entity_label, self.shuffle_data)
            f1 = kfold.cross_validate(X, hparams)
            log.debug("F-score is {}.".format(f1))
            return {"loss": -f1, 'status': STATUS_OK}

        log.debug("Started tuning model.")
        trials = Trials()
        self.best = fmin(fn=objective,
                         space=self._hparams,
                         algo=tpe.suggest,
                         max_evals=self.max_evals,
                         trials=trials)
        self.f_score_max = -min(trials.losses())
        log.debug("Finished tuning model.")

        log.debug("Started training best model.")
        self.best_estimator = clone(self.ner_model)
        self.best_estimator.fit(X, **self.best)
        log.debug("Finished training best model.")

        log.debug("Best F-score is {}, for the configuration {}.".format(
            self.f_score_max, self.best))
        return self.best_estimator, self.f_score_max
# Example #26
        def optimize_params(model,
                            x,
                            y,
                            space,
                            k=5,
                            max_evals=100,
                            eval_space=False):
            trials = Trials()
            best = fmin(
                partial(objective, model=model, x=x, y=y, k=k),
                space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
            )

            param_values = [t["misc"]["vals"] for t in trials.trials]
            param_values = [{
                key: value
                for key in params for value in params[key]
            } for params in param_values]

            if eval_space:
                param_values = [
                    space_eval(space, params) for params in param_values
                ]

            param_df = pd.DataFrame(param_values)
            param_df["accuracy"] = [1 - loss for loss in trials.losses()]
            return space_eval(space, best), param_df
 def run(self):
     start = time.time()
     trials = Trials()
     best = fmin(self._obj, self.model_param_space._build_space(),
                 tpe.suggest, hp_iter, trials)
     best_params = space_eval(self.model_param_space._build_space(), best)
     best_params = self.model_param_space._convert_int_param(best_params)
     trial_rmses = np.asarray(trials.losses(), dtype=float)
     best_ind = np.argmin(trial_rmses)
     best_rmse_mean = trial_rmses[best_ind]
     best_rmse_std = trials.trial_attachments(
         trials.trials[best_ind])["std"]
     self.logger.info("-" * 50)
     self.logger.info("Best RMSE")
     self.logger.info("      Mean: %.6f" % best_rmse_mean)
     self.logger.info("      std: %.6f" % best_rmse_std)
     self.logger.info("Best param")
     self.task._print_param_dict(best_params)
     end = time.time()
     _sec = end - start
     _min = int(_sec / 60.)
     self.logger.info("Time")
     if _min > 0:
         self.logger.info("      %d mins" % _min)
     else:
         self.logger.info("      %d secs" % _sec)
     self.logger.info("-" * 50)
 def _optimize_by_hyperopt_pkg(self, func_caller, max_capital, options):
   """ Optimizes the function using hyperopt package """
   try:
     from hyperopt import fmin, Trials
   except ImportError:
     raise ImportError('hyperopt package is not installed')
   space = options.space
   algo = options.algo
   param_space = self._get_space_params(space, func_caller.domain.bounds)
   func_to_min = _get_func_to_min_from_func_caller(func_caller)
   trials = Trials()
   best = fmin(func_to_min, space=param_space, algo=algo,
               max_evals=int(max_capital), trials=trials)
   history = Namespace()
   trial_data = trials.trials
   total_num_queries = len(trial_data)
   history.query_step_idxs = [i for i in range(total_num_queries)]
   pts_in_hypopt_format = [trial_data[i]['misc']['vals'].values()
                           for i in range(total_num_queries)]
   history.query_points = [flatten_list_of_lists(pt)
                           for pt in pts_in_hypopt_format]
   history.query_send_times = \
                          [float(trial_data[i]['book_time'].isoformat().split(':')[-1]) \
                           for i in range(total_num_queries)]
   history.query_receive_times = \
                       [float(trial_data[i]['refresh_time'].isoformat().split(':')[-1]) \
                        for i in range(total_num_queries)]
   losses = [-loss for loss in trials.losses()]
   history.query_vals = losses
   history = common_final_operations_for_all_external_packages(history, self.func_caller,
                                                               options)
   return best, history
# Example #30
def hyperparam_tuning(func, search_space, max_evals, algo=tpe.suggest):
    trials = Trials()
    best = fmin(func,
                search_space,
                algo=algo,
                max_evals=max_evals,
                trials=trials)
    print("Best fit:", space_eval(search_space, best))
    trial_loss = np.asarray(trials.losses(), dtype=float)
    best_ind = np.argmin(trial_loss)
    best_loss = trial_loss[best_ind]
    print("Best Loss:", best_loss)

    trial_list = []
    for trial in trials:
        temp_params = dict()
        for key in trial['misc']['vals']:
            temp_params[key] = trial['misc']['vals'][key][0]

        trial_list.append((temp_params, trial['result']['loss']))
    trial_list = sorted(trial_list, key=lambda x: x[1])

    result = {
        "best_params": space_eval(search_space, best),
        "trials": trial_list,
        "best_loss": best_loss
    }
    return result
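
# Illustrative usage (added; not part of the original source): minimize a simple
# quadratic with hyperparam_tuning. Assumes hyperopt's hp, tpe, Trials and space_eval
# are imported as in the snippets above.
demo_space = {'x': hp.uniform('x', -5.0, 5.0)}
demo = hyperparam_tuning(lambda p: (p['x'] - 2.0) ** 2, demo_space, max_evals=50)
print(demo['best_params'], demo['best_loss'])  # the best x should end up close to 2.0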
def opt_method(hsidata, initializers, resdir, max_evals):
    dataset_name = hsidata.dataset_name

    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    configpath = os.path.join(__location__, 'datasets.cfg')
    parser = ConfigParser()
    parser.read(configpath)
    max_iter = parser.getint(dataset_name, 'max_iter')

    def objective_func(hsidata, hyperpars):

        Y = hsidata.data
        ref_endmembers = hsidata.ref_endmembers
        initializer = hyperpars.pop('initializer')
        init_endmembers = initials[initializer][0]
        init_abundances = initials[initializer][1]

        A, S, J, SAD = lhalf(ref_endmembers, init_endmembers,
                             init_abundances, Y, **hyperpars, verbose=True)

        MSE = mse(Y, A, np.transpose(S))
        S = S.reshape(hsidata.n_rows, hsidata.n_cols, hsidata.n_endmembers).transpose((1, 0, 2))
        results = {'endmembers': A, 'abundances': S, 'loss': J, 'SAD': SAD, 'MSE': MSE}
        loss = SAD[-1] * (1 + np.std(np.sum(S, -1).flatten())) * (1 + np.abs(1 - np.mean(np.sum(S, -1).flatten())))
        return {'loss': loss, 'status': STATUS_OK, 'attachments': results}

    initials = {}
    initial_keys = []
    for key, value in initializers.items():
        initial_keys.append(key)
        initials[key] = (hsidata.initialize(value))

    space = {
        'max_iter': max_iter,
        'q': hp.uniform('lhalf_' + dataset_name + '_q', 0, 1),
        'delta': hp.lognormal('lhalf_' + dataset_name + '_delta', 0, 2),
        'initializer': hp.choice('lhalf_' + dataset_name + '_initializer', initializers)
    }


    h = [hp.lognormal('lhalf_' + dataset_name + '_h' + str(i), 0, 1) for i in range(hsidata.n_endmembers)]

    space['h'] = h

    trials = Trials()

    pars = fmin(lambda x: objective_func(hsidata, x),
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.RandomState(random_seed))

    improvements = reduce(improvement_only, trials.losses(), [])

    save_config(resdir, dataset_name, pars, trials.average_best_error())
    print(list(enumerate(initial_keys)))
    return improvements, pars, trials
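
# Hedged sketch (added; not in the original source) of the `improvement_only` reducer
# used above: it keeps only the losses that improve on the best value seen so far, so
# the returned list traces the optimizer's progress. Note that on Python 3, `reduce`
# must be imported from functools.
def improvement_only(best_so_far, loss):
    if not best_so_far or loss < best_so_far[-1]:
        best_so_far.append(loss)
    return best_so_far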
def bo_tpe_RandomForestRegressor(X, y):

    starttime = datetime.datetime.now()

    # Define the objective function
    def objective(params):
        params = {
            'n_estimators': int(params['n_estimators']),
            'max_depth': int(params['max_depth']),
            'max_features': int(params['max_features']),
            "min_samples_split": int(params['min_samples_split']),
            "min_samples_leaf": int(params['min_samples_leaf']),
            "criterion": str(params['criterion'])
        }
        clf = RandomForestRegressor(**params)
        score = -np.mean(
            cross_val_score(
                clf, X, y, cv=3, n_jobs=-1, scoring="neg_mean_squared_error"))

        return {'loss': score, 'status': STATUS_OK}

    # Define the hyperparameter configuration space
    space = {
        'n_estimators': hp.quniform('n_estimators', 10, 150, 1),
        'max_depth': hp.quniform('max_depth', 5, 50, 1),
        "max_features": hp.quniform('max_features', 1, 13, 1),
        "min_samples_split": hp.quniform('min_samples_split', 2, 11, 1),
        "min_samples_leaf": hp.quniform('min_samples_leaf', 1, 11, 1),
        "criterion": hp.choice('criterion', ['mse', 'mae'])
    }
    trials_rf = Trials()
    best_rf = fmin(fn=objective,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=20,
                   trials=trials_rf)
    print("Random Forest MSE score:%.4f" % min(trials_rf.losses()))
    endtime = datetime.datetime.now()
    process_time_rf = endtime - starttime
    print("程序执行时间(秒):{}".format(process_time_rf))
    print("最佳超参数值集合:", best_rf)
    save_model_object(best_rf, 'BO-TPE', 'RandomForestRegressor',
                      'RandomForestRegressor')
    return min(trials_rf.losses()), process_time_rf, best_rf
# Example #33
def hyperopt_xgboost(X: pd.DataFrame, y: pd.Series, config: Config, max_trials: int=50):
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    params = {
        "booster": "gbtree",
        "objective": "reg:linear" if config["mode"] == "regression" else "binary:logistic",
        "eval_metric": "rmse" if config["mode"] == "regression" else "auc",
        "silent": True,
        "seed": 1,
    }

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.08), np.log(0.2)),
        "max_depth": hp.choice("max_depth", [2, 3, 4, 5, 6]),
        "max_leaves": hp.choice("max_leaves", np.linspace(10, 200, 50, dtype=int)),
        "subsample": hp.quniform("subsample", 0.5, 1.0, 0.1),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 1.0, 0.1),
        "colsample_bylevel": hp.quniform("colsample_bylevel", 0.6, 1.0, 0.1),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
    }

    watchlist = [(dvalid, 'valid')]
    def objective(hyperparams):
        model = xgb.train({**params, **hyperparams}, dtrain, 300, watchlist,
                early_stopping_rounds=100, verbose_eval=100)

        score = model.best_score
        if config.is_classification():
            score = -score

        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=max_trials, verbose=1,
                         rstate=np.random.RandomState(1))
    
    best_hyperparams = space_eval(space, best)
    best_loss = trials.best_trial['result']['loss']
    log("{:0.8f} {}".format(best_loss, best_hyperparams))
    
    _history = []
    for i, sc in enumerate(trials.losses()):
        p = space_eval(space, {k: v[i] for k, v in trials.vals.items()})
        _history.append((sc, {**p, **params}))
    history = sorted(_history, key = lambda x: x[0], reverse = False)
    for i, p in history:
        print("score: {0}, parameters: {1}".format(i, p))
    return history 
def optimize_to_dist(target, n_trials=10, evals=3, seed=0):
    hyp = {}
    hyp['thr'] = hp.uniform('thr', -1, 0)
    hyp['cert'] = hp.uniform('cert', 0, 1)
    hyp['urg'] = hp.uniform('urg', 0, 1)
    # hyp['emo'] = hp.uniform('emo', 0, 1)
    hyp['emo'] = 0
    rng = np.random.RandomState(seed=seed)

    def objective(hyp):
        thr, cert, urg, emo = hyp['thr'], hyp['cert'], hyp['urg'], hyp['emo']
        RTs, corrects = run_trails(thr,
                                   cert,
                                   urg,
                                   emo,
                                   n_trials=n_trials,
                                   plot=False)
        loss = scipy.stats.entropy(RTs, target)
        return {
            'loss': loss,
            'thr': thr,
            'cert': cert,
            'urg': urg,
            'emo': emo,
            'status': STATUS_OK
        }

    trials = Trials()
    fmin(objective,
         rstate=rng,
         space=hyp,
         algo=tpe.suggest,
         max_evals=evals,
         trials=trials)
    best_idx = np.argmin(trials.losses())
    best = trials.trials[best_idx]
    thr = best['result']['thr']
    cert = best['result']['cert']
    urg = best['result']['urg']
    emo = best['result']['emo']
    final_RTs, final_corrects = run_trails(thr,
                                           cert,
                                           urg,
                                           emo,
                                           n_trials=n_trials,
                                           plot=True)
    np.savez("best.npz",
             final_RTs=final_RTs,
             final_corrects=final_corrects,
             thr=thr,
             cert=cert,
             urg=urg,
             emo=emo)
# Example #36
def make_opt_predict_by_models(specified_models):
    """
    使用指定的模型预测结果
    所有尝试的参数均记录在文件中
    :param specified_models:
    :return:best_kappa_mean, best_kappa_std
    """
    log_path = "%s/Log" % config.output_path
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    models_best_params = []
    # check whether the models passed in are among the configured models
    for feat_name in specified_models:
        if feat_name not in model_library_config.feat_names:
            continue
        # param space (the model also needs it internally, for the log header)
        feat_folder, param_space = model_library_config.model_config[feat_name]
        model = create_model(param_space, config.solution_info, feat_folder,
                             feat_name)
        model.log_header()

        print("************************************************************")
        print("Search for the best params")
        # global trial_counter
        trials = Trials()
        objective = lambda p: model.hyperopt_obj(p, feat_folder, feat_name)
        best_params = fmin(objective,
                           param_space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=param_space["max_evals"])
        # convert the numeric attributes contained in best_params to int
        for f in model_library_config.int_feat:
            if f in best_params:
                best_params[f] = int(best_params[f])
        print("************************************************************")
        print("Best params")
        for k, v in best_params.items():
            print("        %s: %s" % (k, v))
        # collect the losses of all trials
        trial_kappas = -np.asarray(trials.losses(), dtype=float)
        best_kappa_mean = max(trial_kappas)
        # np.where returns coordinates for each dimension
        ind = np.where(trial_kappas == best_kappa_mean)[0][0]
        # find the std attached to the best trial
        best_kappa_std = trials.trial_attachments(trials.trials[ind])['std']
        print("Kappa stats")
        print("Mean: %.6f\n        Std: %.6f" %
              (best_kappa_mean, best_kappa_std))

        models_best_params.append((feat_name, best_kappa_mean, best_kappa_std))

    return models_best_params
def hyperparam_tuning(func, search_space, max_evals, algo=tpe.suggest):
    trials = Trials()
    best = fmin(func,
                search_space,
                algo=algo,
                max_evals=max_evals,
                trials=trials)
    print("Best fit:", space_eval(search_space, best))
    trial_loss = np.asarray(trials.losses(), dtype=float)
    best_ind = np.argmin(trial_loss)
    best_loss = trial_loss[best_ind]
    print("Best Loss:", best_loss)
    return space_eval(search_space, best), trials, best_loss
# Example #39
    def _bayes_hyper_tune(self, space: dict, max_evals: int = 3) -> float:
        trials = Trials()

        hp_assignments = fmin(fn=self._evaluate_model,
                              space=space,
                              trials=trials,
                              algo=tpe.suggest,
                              verbose=False,
                              max_evals=max_evals)

        self.best_parameters = space_eval(space, hp_assignments)

        return min(trials.losses())
# Example #40
    def tune_model(
        self,
        ds_x,
        ds_y,
        folds,
        eval_rounds=100,
        groups=None,
        trials=None,
        mon_cons=None,
        categorical=None,
    ):
        """
        Main function responsible for tuning hyperparameters

        :param ds_x: pandas.Dataframe or numpy.array
            Training data
        :param ds_y: pandas.Dataframe or numpy.array
            Training label
        :param folds: sklearn.model_selection or sklearn.cross_validation object
            folds for cross-validation of hyperparameters
        :param eval_rounds: int
            number of iterations to run the hyperparameter tuning
        :param groups: numpy.array
            index of groups used for GroupKFold
        :param trials: hyperopt.Trials object
            pretuned hyperopt trials object if available
        :param mon_cons: str(tuple) for xgboost, tuple for lightgbm
            index of monotonic constraints
        :param categorical: list
            index of categorical feature for lightgbm
        :return: parameters: dict
            the best hyperparameters
        """
        # Create hyperopt Trials object
        if trials is None:
            trials = Trials()
            additional_evals = eval_rounds
        else:
            additional_evals = len(trials.losses()) + eval_rounds

        # Create the loss function
        loss_func = self.create_loss_func(ds_x, ds_y, folds, groups)

        # Find optimal hyperparameters
        parameters = self.optimize(trials, loss_func, additional_evals,
                                   mon_cons, categorical)

        self.params = parameters
        self.trials = trials

        return parameters
# Example #41
    def run(self):
        line_index = 1
        self.param_space = ModelParamSpace()
        for task_mode in learner_space.keys():
            if task_mode not in learner_space:
                print('%s model missed' % task_mode)
                continue
            print('start %s model task' % task_mode)
            for learner in learner_space[task_mode]:
                print('optimizing %s' % learner)
                self.leaner_name = learner
                start = time.time()
                trials = Trials()
                logname = "%s_%s_%s.log" % (
                    task_mode, learner,
                    datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
                self.logger = logging_utils._get_logger(
                    config.LOG_DIR, logname)
                best = fmin(lambda param: self._obj(param, task_mode),
                            self.param_space._build_space(learner),
                            tpe.suggest, self.max_evals, trials)

                end = time.time()
                time_cost = time_utils.time_diff(start, end)
                self.logger.info("Hyperopt_Time")
                self.logger.info("     %s" % time_cost)
                self.logger.info("-" * 50)
                print("   Finished %d hyper train with %d-fold cv, took %s" %
                      (self.max_evals, self.n_iter, time_cost))

                best_params = space_eval(
                    self.param_space._build_space(learner), best)
                best_params = self.param_space._convert_int_param(best_params)
                trial_loss = np.asarray(trials.losses(), dtype=float)
                best_ind = np.argmin(trial_loss)
                auc_cv_mean = -trial_loss[best_ind]
                test_auc = trials.trial_attachments(
                    trials.trials[best_ind])["test_auc"]
                refit_time = trials.trial_attachments(
                    trials.trials[best_ind])["refit_time"]

                with open(config.MODEL_COMPARE, 'a+') as f:
                    if line_index:
                        line_index = 0
                        f.writelines(
                            "task_mode   learner   auc_cv_mean   test_auc   refit_time   best_params \n"
                        )
                    f.writelines("%s   %s   %.4f   %.4f   %s   %s \n" %
                                 (task_mode, learner, auc_cv_mean, test_auc,
                                  refit_time, best_params))
    def hyperopt_search(self, parallel=False):  # TODO: implement parallel search with MongoTrials
        def objective(kwargs):
            start = dt.now()
            self.get_hyperparam_string(**kwargs)
            self.fit_vw()
            self.validate_vw()
            loss = self.validation_metric_vw()

            finish = dt.now()
            elapsed = finish - start
            self.logger.info("evaluation time for this step: %s" % str(elapsed))

            # clean up
            subprocess.call(shlex.split('rm %s %s' % (self.train_model, self.holdout_pred)))

            to_return = {'status': STATUS_OK,
                         'loss': loss,  # TODO: include also train loss tracking in order to prevent overfitting
                         'eval_time': elapsed,
                         'train_command': self.train_command
                        }
            return to_return

        trials = Trials()
        if self.searcher == 'tpe':
            algo = tpe.suggest
        elif self.searcher == 'rand':
            algo = rand.suggest

        logging.debug("starting hypersearch...")
        best_params = fmin(objective, space=self.space, trials=trials, algo=algo, max_evals=self.max_evals)
        self.logger.debug("the best hyperopt parameters: %s" % str(best_params))

        best_configuration = trials.results[np.argmin(trials.losses())]['train_command']
        best_loss = trials.results[np.argmin(trials.losses())]['loss']
        self.logger.info("\n\nA FULL TRAINING COMMAND WITH THE BEST HYPERPARAMETERS: \n%s" % best_configuration)
        self.logger.info("\n\nTHE BEST LOSS VALUE: \n%s" % best_loss)

        return best_configuration, best_loss
Exemple #43
def main():
    usage = "%prog train_text.json train_labels.csv dev_text.json dev_labels.csv output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-m',
        dest='max_iter',
        default=50,
        help='Maximum iterations of Bayesian optimization; default=%default')

    (options, args) = parser.parse_args()
    max_iter = int(options.max_iter)

    global train_data_filename, train_label_filename, dev_data_filename, dev_label_filename
    global output_dir, train_feature_dir, dev_feature_dir, model_dir, log_filename, trial_num

    train_data_filename = args[0]
    train_label_filename = args[1]
    dev_data_filename = args[2]
    dev_label_filename = args[3]
    output_dir = args[4]

    train_feature_dir = output_dir + '/train_features/'
    dev_feature_dir = output_dir + '/dev_features/'
    model_dir = output_dir + '/saved_models/'

    trial_num = 0

    for directory in [
            output_dir, train_feature_dir, dev_feature_dir, model_dir
    ]:
        if not os.path.exists(directory):
            os.makedirs(directory)
    log_filename = os.path.join(output_dir, 'log.txt')

    with open(log_filename, 'w') as logfile:
        logfile.write(','.join([
            train_data_filename, train_label_filename, dev_data_filename,
            dev_label_filename, train_feature_dir, dev_feature_dir, output_dir
        ]) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)

    print space_eval(space, best)
    print "losses:", [-l for l in trials.losses()]
    print("number of trials: " + str(len(trials.trials)))
def TunningParamter(param,data,features,feature,source_name,real_value,int_boolean):
    data = data[~pd.isnull(all_data[feature])]
    print data.shape
    ISOTIMEFORMAT='%Y-%m-%d %X'
    start = time.strftime(ISOTIMEFORMAT, time.localtime())
    trials = Trials()
    objective = lambda p : trainModel(p, data, features, feature,source_name,real_value,int_boolean)
    
    best_parameters = fmin(objective, param, algo=tpe.suggest, max_evals=param['max_evals'], trials=trials)
    #now we need to get best_param
    trials_loss = np.asanyarray(trials.losses(),dtype=float)
    best_loss = min(trials_loss)
    ind = np.where(trials_loss==best_loss)[0][0]
    best_loss_std = trials.trial_attachments(trials.trials[ind])['std']
    end = time.strftime(ISOTIMEFORMAT,time.localtime())
    dumpMessage(best_parameters, best_loss, best_loss_std,param['task'],source_name,start,end)
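
The 'std' attachment read back above with trials.trial_attachments() has to be written by the objective; hyperopt stores whatever the objective returns under the 'attachments' key of its result dict. A hedged sketch of that pattern (trainModel itself is not shown here, and run_cv_somehow is a hypothetical helper):

import numpy as np
from hyperopt import STATUS_OK

def trainModel_sketch(params, data, features, feature):
    fold_losses = run_cv_somehow(params, data, features, feature)  # hypothetical CV helper
    return {
        'loss': float(np.mean(fold_losses)),
        'status': STATUS_OK,
        # stored per trial, retrievable later via trials.trial_attachments(trial)['std']
        'attachments': {'std': str(np.std(fold_losses))},
    }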
Exemple #45
def main():

    usage = "%prog <DRLD|MIP|MOLD|Primary|General|Terrorist|PK-Brown|PK-Roberts|PK-Pelosi|PK-Cheney>"
    parser = OptionParser(usage=usage)
    parser.add_option("-m", dest="model", default="LR", help="Model: (LR|SVM|MNB|SVMNB); default=%default")
    parser.add_option("-t", dest="test_fold", default=0, help="Test fold; default=%default")
    parser.add_option("-o", dest="output_dirname", default="bayes_opt", help="Output directory name")
    parser.add_option(
        "--reuse", dest="reuse", action="store_true", default=False, help="Use reusable holdout; default=%default"
    )
    parser.add_option(
        "--alpha",
        dest="alpha",
        action="store_true",
        default=False,
        help="Include alpha in search space (instead of grid search); default=%default",
    )
    parser.add_option(
        "--n_dev_folds",
        dest="n_dev_folds",
        default=5,
        help="Number of dev folds to use when tuning/evaluating; default=%default",
    )

    # parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space, run, group, test_fold, n_dev_folds

    run = args[0]
    reuse = options.reuse
    search_alpha = options.alpha
    # n_codes = int(options.n_codes)
    output_dirname = options.output_dirname
    model = options.model
    test_fold = int(options.test_fold)
    n_dev_folds = int(options.n_dev_folds)

    # allow user to specify a particular choice of model
    if model == "LR":
        space["model"] = {
            "model": "LR",
            #'regularization': hp.choice('regularization', ['l1', 'l2'])
            "regularization": "l1",
        }
    elif model == "SVM":
        space["model"] = {
            "model": "SVM",
            "kernel": hp.choice(
                "ktype",
                [{"ktype": "linear"}, {"ktype": "poly", "degree": hp.choice("degree", [2, 3, 4])}, {"ktype": "rbf"}],
            ),
        }
    elif model == "MNB":
        space["model"] = {"model": "MNB"}
    elif model == "SVMNB":
        space["model"] = {"model": "SVMNB", "beta": hp.uniform("beta", 0, 1)}
    else:
        sys.exit("Choice of model not supported!")

    if run == "DRLD":
        add_drld()
        group = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]
        n_codes = 33
    elif run == "MIP":
        add_MIP()
        group = ["MIP-Personal-1", "MIP-Personal-2", "MIP-Political-1", "MIP-Political-2"]
        n_codes = 74
    elif run == "MOLD":
        add_MOLD()
        group = ["McCain-Likes", "McCain-Dislikes", "Obama-Likes", "Obama-Dislikes"]
        n_codes = 34
    elif run == "Primary":
        add_obama()
        add_clinton()
        group = ["Obama-Primary", "Clinton-Primary"]
        n_codes = 42
    elif run == "General":
        add_obama()
        add_mccain()
        group = ["Obama-General", "McCain-General"]
        n_codes = 41
    elif run == "Terrorists":
        group = [run]
        n_codes = 28
    elif run == "PK-Brown":
        group = [run]
        n_codes = 14
    elif run == "PK-Cheney":
        group = [run]
        n_codes = 12
    elif run == "PK-Pelosi":
        group = [run]
        n_codes = 15
    elif run == "PK-Roberts":
        group = [run]
        n_codes = 14
    else:
        sys.exit("Dataset not recognized")

    output_dirname += "_" + model

    if search_alpha:
        space["alphas"] = []
        for i in range(n_codes):
            space["alphas"].append(hp.loguniform("alpha" + str(i), -1.15, 9.2))
        output_dirname += "_alphas"

    if reuse:
        output_dirname += "_reuse"
    else:
        output_dirname += "_noreuse"
    output_dirname += "_" + run

    if n_dev_folds != 5:
        output_dirname += "_" + str(n_dev_folds)

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), "log")

    with codecs.open(output_filename, "w") as output_file:
        output_file.write(output_dirname + "\n")
        output_file.write("reuse = " + str(reuse) + "\n")
        output_file.write("search alphas = " + str(search_alpha) + "\n")

    trials = Trials()
    best = fmin(call_experiment, space=space, algo=tpe.suggest, max_evals=40, trials=trials)

    print space_eval(space, best)
    print trials.losses()
Exemple #46
    #name = model + '_' + regularizer + '_' + '_'.join(feature_list) + '_' + str(hyperparams[0])
    name = model + '_' + regularizer + '_' + '_'.join(feature_list)
    datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    print name
    return experiment.run_group_experiment(name, datasets, 0, feature_list,
                                               model_type=model, regularizer=regularizer)


trials = Trials()
best = fmin(call_experiment,
            space=space,
            algo=tpe.suggest,
            max_evals=3,
            trials=trials)

#rseed=np.random.randint(1, 4294967295)
#print best
print space_eval(space, best)
print trials.losses()
#for trial in trials.trials:
#    print trial



#run_group_experiment('profile_test', ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes'],
#                                    0, ['ngrams'], model_type='SVM')



#experiment_new.main()
Exemple #47
def main():

    usage = "%prog project label_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='LR',
                      help='Model: (LR|SVM|MNB|SVMNB); default=%default')
    parser.add_option('-f', dest='test_fold', default=0,
                      help='Test fold; default=%default')
    parser.add_option('-o', dest='output_prefix', default='bayes_opt',
                      help='Output prefix')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')
    parser.add_option('--alpha', dest='alpha', action="store_true", default=False,
                      help='Include alpha in search space (instead of grid search); default=%default')
    parser.add_option('--n_dev_folds', dest='n_dev_folds', default=None,
                      help='Number of dev folds to use when tuning/evaluating; default=%default')
    parser.add_option('-t', dest='target_col', default=2,
                      help='Index of column containing labels; default=%default')
    parser.add_option('-w', dest='weight_col', default=-1,
                      help='Index of column containing weights (-1 for None); default=%default')
    parser.add_option('-v', dest='verbose', default=1,
                      help='Level of verbosity; default=%default')
    parser.add_option('-n', dest='n_iter', default=60,
                      help='Number of iterations; default=%default')
    parser.add_option('-a', dest='add_pseudo', action="store_true", default=False,
                      help='Make use of pseudo-documents; default=%default')
    parser.add_option('--random', dest='random_search', action="store_true", default=False,
                      help='Use random search instead of TPE; default=%default')
    parser.add_option('--only_unanimous', dest='only_unanimous', action="store_true", default=False,
                      help='Use only the articles with unanimous agreement for evaluation; default=%default')



    #parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space, label_file, group, test_fold, n_dev_folds
    global weight_col, verbose, target, add_pseudo, only_unanimous

    project = args[0]
    dirs.make_base_dir(project)

    label_file = args[1]
    reuse = options.reuse
    search_alpha = options.alpha
    #n_codes = int(options.n_codes)
    output_prefix = options.output_prefix
    model = options.model
    test_fold = int(options.test_fold)
    if options.n_dev_folds is not None:
        n_dev_folds = int(options.n_dev_folds)
    else:
        n_dev_folds = None
    weight_col = int(options.weight_col)
    target = int(options.target_col)
    verbose = int(options.verbose)
    n_iter = int(options.n_iter)
    add_pseudo = options.add_pseudo
    random_search = options.random_search
    only_unanimous = options.only_unanimous

    # allow user to specify a particular choice of model
    if model == 'LR':
        space['model'] = {
            'model': 'LR',
            'regularization': hp.choice('regularization', ['l1', 'l2'])
            #'regularization': 'l1'
        }
    elif model == 'SVM':
        space['model'] = {
            'model': 'SVM',
            'kernel': hp.choice('ktype', [
                {'ktype': 'linear'},
                {'ktype': 'poly', 'degree': hp.choice('degree', [2, 3, 4])},
                {'ktype': 'rbf'}
            ]
                                )
        }
    elif model == 'MNB':
        space['model'] = {
            'model': 'MNB'
        }
    elif model == 'SVMNB':
        space['model'] = {
            'model': 'SVMNB',
            'beta': hp.uniform('beta', 0, 1)
        }
    else:
        sys.exit('Choice of model not supported!')

    if add_pseudo:
        add_pseudo_options()

    output_prefix += '_' + model

    if search_alpha:
        space['alpha'] = hp.loguniform('alpha', -3, 10)
        output_prefix += '_alpha'
    else:
        output_prefix += '_noalpha'

    if reuse:
        output_prefix += '_reuse'

    all_items, target_name, labels, weights, _ = lr.get_labels(label_file, target, weight_col=weight_col)
    output_dirname = experiment.make_exp_dir(test_fold, target_name, output_prefix)
    basedir = os.path.split(output_dirname)[0]
    output_filename = fh.make_filename(basedir, output_prefix, 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        output_file.write('reuse = ' + str(reuse) + '\n')
        output_file.write('search alphas = ' + str(search_alpha) + '\n')

    trials = Trials()

    if random_search:
        best = fmin(call_experiment,
                    space=space,
                    algo=rand.suggest,
                    max_evals=n_iter,
                    trials=trials)
    else:
        best = fmin(call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=n_iter,
                    trials=trials)

    print space_eval(space, best)
    print trials.losses()
 train = pd.read_csv("../data/train.process.csv")
 for feat_name,feat_fold in zip(feat_names,feat_folders):
     # first we need to read the data for our model
     # this is to reduce the time spent reading data
     print 'read data for training'
     print 'generate model in condition in %s'%(feat_name)
     print "Search for the best models"
     print "fea_name %s"%(feat_name)
     # to reduce the time spent reading data
     #the train.shape[0]=39774
     ISOTIMEFORMAT='%Y-%m-%d %X'
     start_time = time.strftime( ISOTIMEFORMAT, time.localtime() )
     param_space = para_spaces[feat_name]
     trials = Trials()
     objective = lambda p : trainModel(p, feat_fold, feat_name)
     best_params = fmin(objective,param_space,algo=tpe.suggest,
                       trials=trials, max_evals=param_space["max_evals"])
     print type(best_params)
     print best_params
     for f in int_feat:
         if best_params.has_key(f):
             best_params[f] = int(best_params[f])
     trial_acc = -np.asanyarray(trials.losses(), dtype=float )
     best_acc_mean = max(trial_acc)
     ind = np.where(trial_acc==best_acc_mean)[0][0]
     best_acc_std = trials.trial_attachments(trials.trials[ind])['std']
     end_time = time.strftime( ISOTIMEFORMAT, time.localtime() )
     dumpModelMessage(best_params, best_acc_mean, best_acc_std, feat_fold,feat_name,start_time,end_time)
     print ("Best stats")
     print ('Mean:%.6f \nStd:%.6f \n'%(best_acc_mean,best_acc_std))
     
    def work(self):
        np.random.seed(1234)
        bandit = self.bandit
        LEN = self.LEN.get(bandit.name, 100)
        thresh = self.thresholds[bandit.name]

        print 'STARTING TEST', bandit.name
        rtrials = Trials()
        fmin(fn=passthrough,
            space=self.bandit.expr,
            trials=rtrials,
            algo=rand.suggest,
            max_evals=LEN,
            rstate=np.random)
        print 'RANDOM BEST 6:', list(sorted(rtrials.losses()))[:6]

        if bandit.name != 'n_arms':
            # -- assert that our threshold is meaningful
            assert min(rtrials.losses()) > thresh

        assert bandit.name is not None
        algo = partial(
            suggest_algos.ei,
            stop_at=self.thresholds[bandit.name])

        trials = Trials()
        fmin(fn=passthrough,
            space=self.bandit.expr,
            trials=trials,
            algo=algo,
            max_evals=LEN,
            rstate=np.random)
        assert len(trials) <= LEN


        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)


        #print trials.losses()
        print 'SUGGEST BEST 6:', list(sorted(trials.losses()))[:6]
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        print 'Thresh', thresh
        assert min(trials.losses()) < thresh
Exemple #50
    def work(self):

        bandit = self.bandit
        assert bandit.name is not None
        print 'Bandit', bandit.name
        algo = TreeParzenEstimator(bandit,
                gamma=self.gammas.get(bandit.name,
                    TreeParzenEstimator.gamma),
                prior_weight=self.prior_weights.get(bandit.name,
                    TreeParzenEstimator.prior_weight),
                n_EI_candidates=self.n_EIs.get(bandit.name,
                    TreeParzenEstimator.n_EI_candidates),
                )
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        exp = Experiment(trials, algo)
        exp.catch_bandit_exceptions = False
        exp.run(LEN)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            exp = Experiment(rtrials, Random(bandit))
            exp.run(LEN)
            print 'RANDOM MINS', list(sorted(rtrials.losses()))[:6]
            #logx = np.log([s['x'] for s in rtrials.specs])
            #print 'RND MEAN', np.mean(logx)
            #print 'RND STD ', np.std(logx)

        print algo.n_EI_candidates
        print algo.gamma
        print algo.prior_weight

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)

        #print trials.losses()
        print 'TPE    MINS', list(sorted(trials.losses()))[:6]
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print 'Thresh', thresh
        assert min(trials.losses()) < thresh
Exemple #51
    def work(self):

        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(tpe.suggest,
                gamma=self.gammas.get(bandit.name,
                    tpe._default_gamma),
                prior_weight=self.prior_weights.get(bandit.name,
                    tpe._default_prior_weight),
                n_EI_candidates=self.n_EIs.get(bandit.name,
                    tpe._default_n_EI_candidates),
                )
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        fmin(passthrough,
            space=bandit.expr,
            algo=algo,
            trials=trials,
            max_evals=LEN,
            rstate=np.random.RandomState(123),
            catch_eval_exceptions=False)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(passthrough,
                space=bandit.expr,
                algo=rand.suggest,
                trials=rtrials,
                max_evals=LEN)
            print 'RANDOM MINS', list(sorted(rtrials.losses()))[:6]
            #logx = np.log([s['x'] for s in rtrials.specs])
            #print 'RND MEAN', np.mean(logx)
            #print 'RND STD ', np.std(logx)

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)

        #print trials.losses()
        print 'TPE    MINS', list(sorted(trials.losses()))[:6]
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print 'Thresh', thresh
        assert min(trials.losses()) < thresh
        log_handler = open(log_file, 'wb' )
        writer = csv.writer( log_handler )
        headers = ['trial_counter', 'kappa_mean', 'kappa_std' ]
        for k,v in sorted(param_space.items()):
            headers.append(k)
        writer.writerow( headers )
        log_handler.flush()
        
        print("************************************************************")
        print("Search for the best params")
        #global trial_counter
        trial_counter = 0
        trials = Trials()
        objective = lambda p: hyperopt_wrapper(p,feat_name)
        best_params = fmin(objective, param_space, algo=tpe.suggest,
                           trials=trials, max_evals=param_space["max_evals"])
        for f in int_feat:
            if best_params.has_key(f):
                best_params[f] = int(best_params[f])
        print("************************************************************")
        print("Best params")
        for k,v in best_params.items():
            print "        %s: %s" % (k,v)
        trial_kappas = -np.asarray(trials.losses(), dtype=float)
        best_kappa_mean = max(trial_kappas)
        ind = np.where(trial_kappas == best_kappa_mean)[0][0]
        best_kappa_std = trials.trial_attachments(trials.trials[ind])['std']
        print("Kappa stats")
        print("        Mean: %.6f\n        Std: %.6f" % (best_kappa_mean, best_kappa_std))
    
class HyperOptimizer(object):
    def __init__(self, train_set, holdout_set, command, max_evals=100,
                 outer_loss_function='logistic',
                 searcher='tpe', is_regression=False):
        self.train_set = train_set
        self.holdout_set = holdout_set

        self.train_model = './current.model'
        self.holdout_pred = './holdout.pred'
        self.trials_output = './trials.json'
        self.hyperopt_progress_plot = './hyperopt_progress.png'
        self.log = './log.log'

        self.logger = self._configure_logger()

        # hyperopt parameter sample, converted into a string with flags
        self.param_suffix = None
        self.train_command = None
        self.validate_command = None

        self.y_true_train = []
        self.y_true_holdout = []

        self.outer_loss_function = outer_loss_function
        self.space = self._get_space(command)
        self.max_evals = max_evals
        self.searcher = searcher
        self.is_regression = is_regression

        self.trials = Trials()
        self.current_trial = 0

    def _get_space(self, command):
        hs = HyperoptSpaceConstructor(command)
        hs.string_to_pyll()
        return hs.space

    def _configure_logger(self):
        LOGGER_FORMAT = "%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s/%(module)s:%(lineno)d]: %(message)s"
        LOGGER_DATEFMT = "%Y-%m-%d %H:%M:%S"
        LOGFILE = self.log

        logging.basicConfig(format=LOGGER_FORMAT,
                            datefmt=LOGGER_DATEFMT,
                            level=logging.DEBUG)
        formatter = logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)

        file_handler = logging.FileHandler(LOGFILE)
        file_handler.setFormatter(formatter)

        logger = logging.getLogger()
        logger.addHandler(file_handler)
        return logger

    def get_hyperparam_string(self, **kwargs):
        for arg in ['--passes']: #, '--rank', '--lrq']:
            if arg in kwargs:
                kwargs[arg] = int(kwargs[arg])

        #print 'KWARGS: ', kwargs
        flags = [key for key in kwargs if key.startswith('-')]
        for flag in flags:
            if kwargs[flag] == 'omit':
                del kwargs[flag]

        self.param_suffix = ' '.join(['%s %s' % (key, kwargs[key]) for key in kwargs if key.startswith('-')])
        self.param_suffix += ' %s' % (kwargs['argument'])

    def compose_vw_train_command(self):
        data_part = ('vw -d %s -f %s --holdout_off -c '
                     % (self.train_set, self.train_model))
        self.train_command = ' '.join([data_part, self.param_suffix])

    def compose_vw_validate_command(self):
        data_part = 'vw -t -d %s -i %s -p %s --holdout_off -c' \
                    % (self.holdout_set, self.train_model, self.holdout_pred)
        self.validate_command = data_part

    def fit_vw(self):
        self.compose_vw_train_command()
        self.logger.info("executing the following command (training): %s" % self.train_command)
        subprocess.call(shlex.split(self.train_command))

    def validate_vw(self):
        self.compose_vw_validate_command()
        self.logger.info("executing the following command (validation): %s" % self.validate_command)
        subprocess.call(shlex.split(self.validate_command))

    def get_y_true_train(self):
        self.logger.info("loading true train class labels...")
        yh = open(self.train_set, 'r')
        self.y_true_train = []
        for line in yh:
            self.y_true_train.append(int(line.strip()[0:2]))
        if not self.is_regression:
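            # map VW's {-1, +1} classification labels onto {0, 1} so they line up with predicted probabilities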
            self.y_true_train = [(i + 1.) / 2 for i in self.y_true_train]
        self.logger.info("train length: %d" % len(self.y_true_train))

    def get_y_true_holdout(self):
        self.logger.info("loading true holdout class labels...")
        yh = open(self.holdout_set, 'r')
        self.y_true_holdout = []
        for line in yh:
            self.y_true_holdout.append(int(line.strip()[0:2]))
        if not self.is_regression:
            self.y_true_holdout = [(i + 1.) / 2 for i in self.y_true_holdout]
        self.logger.info("holdout length: %d" % len(self.y_true_holdout))

    def validation_metric_vw(self):
        v = open('%s' % self.holdout_pred, 'r')
        y_pred_holdout = []
        for line in v:
            y_pred_holdout.append(float(line.strip()))

        if self.outer_loss_function == 'logistic':
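            # VW's predictions here are raw scores, so squash them through a sigmoid before computing log loss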
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':  # TODO: write it
            pass

        elif self.outer_loss_function == 'hinge':  # TODO: write it
            pass

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss

    def hyperopt_search(self, parallel=False):  # TODO: implement parallel search with MongoTrials
        def objective(kwargs):
            start = dt.now()

            self.current_trial += 1
            self.logger.info('\n\nStarting trial no.%d' % self.current_trial)
            self.get_hyperparam_string(**kwargs)
            self.fit_vw()
            self.validate_vw()
            loss = self.validation_metric_vw()

            finish = dt.now()
            elapsed = finish - start
            self.logger.info("evaluation time for this step: %s" % str(elapsed))

            # clean up
            subprocess.call(shlex.split('rm %s %s' % (self.train_model, self.holdout_pred)))

            to_return = {'status': STATUS_OK,
                         'loss': loss,  # TODO: include also train loss tracking in order to prevent overfitting
                         'eval_time': elapsed.seconds,
                         'train_command': self.train_command,
                         'current_trial': self.current_trial
            }
            return to_return

        self.trials = Trials()
        if self.searcher == 'tpe':
            algo = tpe.suggest
        elif self.searcher == 'rand':
            algo = rand.suggest

        logging.debug("starting hypersearch...")
        best_params = fmin(objective, space=self.space, trials=self.trials, algo=algo, max_evals=self.max_evals)
        self.logger.debug("the best hyperopt parameters: %s" % str(best_params))

        json.dump(self.trials.results, open(self.trials_output, 'w'))
        self.logger.info('All the trials results are saved at %s' % self.trials_output)

        best_configuration = self.trials.results[np.argmin(self.trials.losses())]['train_command']
        best_loss = self.trials.results[np.argmin(self.trials.losses())]['loss']
        self.logger.info("\n\nA full training command with the best hyperparameters: \n%s\n\n" % best_configuration)
        self.logger.info("\n\nThe best holdout loss value: \n%s\n\n" % best_loss)

        return best_configuration, best_loss

    def plot_progress(self):
        try:
            sns.set_palette('Set2')
            sns.set_style("darkgrid", {"axes.facecolor": ".95"})
        except:
            pass

        self.logger.debug('plotting...')
        plt.figure(figsize=(15,10))
        plt.subplot(211)
        plt.plot(self.trials.losses(), '.', markersize=12)
        plt.title('Per-Iteration Outer Loss', fontsize=16)
        plt.ylabel('Outer loss function value')
        if self.outer_loss_function in ['logistic']:
            plt.yscale('log')
        xticks = [int(i) for i in np.linspace(plt.xlim()[0], plt.xlim()[1], min(len(self.trials.losses()), 11))]
        plt.xticks(xticks, xticks)


        plt.subplot(212)
        plt.plot(np.minimum.accumulate(self.trials.losses()), '.', markersize=12)
        plt.title('Cumulative Minimum Outer Loss', fontsize=16)
        plt.xlabel('Iteration number')
        plt.ylabel('Outer loss function value')
        xticks = [int(i) for i in np.linspace(plt.xlim()[0], plt.xlim()[1], min(len(self.trials.losses()), 11))]
        plt.xticks(xticks, xticks)

        plt.tight_layout()
        plt.savefig(self.hyperopt_progress_plot)
        self.logger.info('The diagnostic hyperopt progress plot is saved: %s' % self.hyperopt_progress_plot)
def main(main_args):

    usage = "%prog project label_file splits_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='LR',
                      help='Model: (LR|SVM|MNB|SVMNB); default=%default')
    parser.add_option('-o', dest='output_prefix', default='bayes_opt',
                      help='Output prefix')
    parser.add_option('--alpha', dest='alpha', action="store_true", default=False,
                      help='Include alpha in search space (instead of grid search); default=%default')
    parser.add_option('--n_dev_folds', dest='n_dev_folds', default=None,
                      help='Number of dev folds to use when tuning/evaluating; default=%default')
    parser.add_option('-t', dest='target_col', default=2,
                      help='Index of column containing labels; default=%default')
    parser.add_option('-w', dest='weight_col', default=-1,
                      help='Index of column containing weights (-1 for None); default=%default')
    parser.add_option('-v', dest='verbose', default=1,
                      help='Level of verbosity; default=%default')
    parser.add_option('-n', dest='n_iter', default=2,
                      help='Number of iterations; default=%default')
    parser.add_option('-a', dest='add_pseudo', action="store_true", default=False,
                      help='Make use of pseudo-documents; default=%default')
    parser.add_option('--random', dest='random_search', action="store_true", default=False,
                      help='Use random search instead of TPE; default=%default')
    parser.add_option('--personas_old', dest='personas_old', action="store_true", default=False,
                      help='Use personas from the DPM; default=%default')
    parser.add_option('--personas_new', dest='personas_new', action="store_true", default=False,
                      help='Use personas from my model; default=%default')
    parser.add_option('--story_types', dest='story_types', action="store_true", default=False,
                      help='Use story types from my model; default=%default')



    #parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()
    if len(args) == 0:
        args = main_args

    global output_dirname, output_filename, search_alpha, space, label_file, group, n_dev_folds
    global weight_col, verbose, target, add_pseudo, output_prefix
    global personas_old, personas_new, story_types

    project = args[0]
    label_file = args[1]
    splits_file = args[2]

    dirs.set_project(project, splits_file)

    search_alpha = options.alpha
    output_prefix = options.output_prefix
    model = options.model
    if options.n_dev_folds is not None:
        n_dev_folds = int(options.n_dev_folds)
    else:
        n_dev_folds = None
    weight_col = int(options.weight_col)
    target = int(options.target_col)
    verbose = int(options.verbose)
    n_iter = int(options.n_iter)
    add_pseudo = options.add_pseudo
    random_search = options.random_search
    personas_old = options.personas_old
    personas_new = options.personas_new
    story_types = options.story_types

    # allow user to specify a particular choice of model
    if model == 'LR':
        space['model'] = {
            'model': 'LR',
            #'regularization': hp.choice('regularization', ['l1', 'l2'])
            'regularization': 'l1'
        }
    elif model == 'SVM':
        space['model'] = {
            'model': 'SVM',
            'kernel': hp.choice('ktype', [
                {'ktype': 'linear'},
                {'ktype': 'poly', 'degree': hp.choice('degree', [2, 3, 4])},
                {'ktype': 'rbf'}
            ]
                                )
        }
    elif model == 'MNB':
        space['model'] = {
            'model': 'MNB'
        }
    elif model == 'SVMNB':
        space['model'] = {
            'model': 'SVMNB',
            'beta': hp.uniform('beta', 0, 1)
        }
    else:
        sys.exit('Choice of model not supported!')

    if personas_old:
        space['features']['personas'] = hp.choice('personas', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': hp.choice('personas_transform', ['binarize', 'normalizel1']),
                'subdir': 'personas',
                'source': 'personasdpm',
            }
        ])
    if personas_new:
        space['features']['personas'] = hp.choice('personas', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': hp.choice('personas_transform', ['binarize', 'normalizel1']),
                'subdir': 'personas',
                'source': 'personas',
            }
        ])
    if story_types:
        space['features']['storytypes'] = hp.choice('storytypes', [
            {
                'use': False
            },
            {
                'use': True,
                'transform': 'normalizel1',
                'subdir': 'personas',
                'source': 'storytypesold',
            }
        ])


    if add_pseudo:
        add_pseudo_options()

    output_prefix += '_' + model

    if search_alpha:
        space['alpha'] = hp.loguniform('alpha', -3, 10)
        output_prefix += '_alpha'
    else:
        output_prefix += '_noalpha'


    all_items, target_name, labels, weights, _ = lr.get_labels(label_file, target, weight_col=weight_col)
    #for t in range(10):
    #    output_dirname = experiment2.make_exp_dir(t, target_name, output_prefix)
    output_dirname = fh.makedirs(dirs.exp_dir, target_name)
    output_filename = os.path.join(output_dirname, output_prefix + '.log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        output_file.write('search alphas = ' + str(search_alpha) + '\n')

    trials = Trials()

    if random_search:
        best = fmin(call_experiment,
                    space=space,
                    algo=rand.suggest,
                    max_evals=n_iter,
                    trials=trials)
    else:
        best = fmin(call_experiment,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=n_iter,
                    trials=trials)

    print space_eval(space, best)
    print trials.losses()
		#"""

		log_file = "%s/%s_hyperopt.log" % (log_path, feat_name)
		log_handler = open( log_file, 'wb' )
		writer = csv.writer( log_handler )
		headers = [ 'trial_counter', 'mean']
		for k,v in sorted(param_space.items()):
			headers.append(k)
		writer.writerow( headers )
		log_handler.flush()

		print("************************************************************")
		print("Search for the best params")
		#global trial_counter
		trial_counter = 0
		trials = Trials()
		objective = lambda p: hyperopt_wrapper(p, feat_folder, feat_name)
		best_params = fmin(objective, param_space, algo=tpe.suggest, 
			trials=trials, max_evals=param_space["max_evals"])

		print("************************************************************")
		print("Best params")
		for k,v in best_params.items():
			print "        %s: %s" % (k,v)
		trial_cv = np.asarray(trials.losses(), dtype=float)
		best_mean = max(trial_cv)
		ind = np.where(trial_cv == best_mean)[0][0]
		print("best cv")
		print("        Mean: %.6f\n" % (best_mean))

def main():

    usage = "%prog <DRLD|MOLD|MIP|Primary|General|PK-...>"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='basic',
                      help='Model: (basic|GRU|LSTM); default=%default')
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_mod',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')
    parser.add_option('--mod', dest='mod', action="store_true", default=False,
                      help='Use modifications; default=%default')


    (options, args) = parser.parse_args()


    global output_dirname, output_filename, reuse, search_alpha, space, mod, dataset
    reuse = options.reuse
    output_dirname = options.output_dirname
    model = options.model
    mod = options.mod

    dataset = args[0]

    if model == 'basic':
        space['arch']['unit'] = 'basic'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 200, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -4, -1)
    elif model == 'GRU':
        space['arch']['unit'] = 'GRU'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 150, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    elif model == 'LSTM':
        space['arch']['unit'] = 'LSTM'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 100, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    else:
        sys.exit('Model not supported!')

    output_dirname += '_' + model

    if reuse:
        output_dirname += '_reuse'

    if mod:
        output_dirname += '_mod'

    output_dirname += '_' + dataset

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=60,
                trials=trials)

    print space_eval(space, best)
    print trials.losses()
class RandFModel():
    
    def __init__(self):
        self.trials = Trials()
        
    
    def score(self, params):
        """
        The score function to use in hyperopt.
        It is average of the validation error of xgboost models, using a K-fold validation

        df_train and kf (k-fold) need to be defined previously
        """

        

        if self.output_file != None:
            with open(self.output_file, "a") as myfile:
                try:
                    myfile.write(str(self.trials.losses()[-2])+'\n')
                except IndexError:
                    pass
                myfile.write(str(params)+', ')
                   
                             
        print "Training with params : "
        print params
        n_estimators = int(params['n_estimators'])
        del params['n_estimators']
                             
        score = 0.
        for train_index, valid_index in self.kf:

            df_train = self.df_train.iloc[train_index]
            df_valid = self.df_train.iloc[valid_index]
            
            model = RandomForestClassifier(n_estimators=n_estimators, **params)

            # fit the model
            model.fit(df_train[self.features], df_train[self.target])
            
            # computing the accuracy of predicted similar pictures
            accuracy = model.score(df_valid[self.features], df_valid[self.target])
            print 'accuracy:', accuracy
            score -= accuracy/float(len(self.kf))

        print "\tScore {0}\n\n".format(score)
        return {'loss': score, 'status': STATUS_OK}



    def optimize(self, space, df_train, features, target, kf, output_file=None):
        """
        Find the best hyperparameters in the parameter space that minimize the score function.

        INPUTS:

        space       : (dict) space of parameters to explore
        df_train    : (pd.DataFrame) contains the training data: the features and the target
        features    : (string list) list of column names to use as features in the model
        target      : (string) name of the column to use as target
        kf          : (sklearn.cross_validation object) a KFold object for cross validation
        output_file : (string, optional) path of a log file to which trial losses and parameters are appended
        """
        
        self.df_train = df_train
        self.features = features
        self.target = target
        self.kf = kf
        self.output_file = output_file

        # searching for the best parameters:
        self.best = fmin(self.score, space, algo=tpe.suggest, trials=self.trials, max_evals=250)

        print 'best parameters:', self.best
        
        if self.output_file != None:
            with open(self.output_file, "a") as myfile:
                myfile.write(str(self.trials.losses()[-2])+'\n')
                myfile.write('best_parameters, '+str(self.best))
        
        best = self.best.copy()
        n_estimators = int(best['n_estimators'])
        del best['n_estimators']
        
        # fit a RandomForest model with the best parameters on the whole training data
        self.model = RandomForestClassifier(n_estimators=n_estimators, **best)
        self.model.fit(df_train[self.features], df_train[self.target])
              

    def predict(self, df):
        """
        return the output of the model for data in df
        """
        return self.model.predict(df)
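
A hedged usage sketch for the class above. The DataFrames, column names, fold count and search-space bounds are assumptions; only n_estimators is cast to int inside score(), so the space sticks to parameters RandomForestClassifier accepts as floats, and optimize() runs the 250 evaluations hard-coded above.

from hyperopt import hp
from sklearn.model_selection import KFold

rf_space = {
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),  # cast to int inside score()
    'max_features': hp.uniform('max_features', 0.3, 1.0),       # float fraction works as-is
}

# the class only iterates over kf and calls len() on it, so a materialized
# list of (train_idx, valid_idx) pairs is enough
folds = list(KFold(n_splits=5, shuffle=True, random_state=0).split(df_train))

rf = RandFModel()
rf.optimize(rf_space, df_train, features=['feat_a', 'feat_b'], target='label',
            kf=folds, output_file='rf_hyperopt.log')
preds = rf.predict(df_test[['feat_a', 'feat_b']])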
class XgbModel():
    
    def __init__(self):
        self.trials = Trials()
        
    
    def score(self, params):
        """
        The score function to use in hyperopt.
        It is average of the validation error of xgboost models, using a K-fold validation

        df_train and kf (k-fold) need to be defined previously
        """
        
        if self.output_file != None:
            with open(self.output_file, "a") as myfile:
                try:
                    myfile.write(str(self.trials.losses()[-2])+'\n')
                except IndexError:
                    print 'Index error'
                myfile.write(str(params)+', ')

        print "Training with params : "
        print params
        num_round = int(params['n_estimators'])
        del params['n_estimators']

        score = 0.
        for train_index, valid_index in self.kf:

            df_train = self.df_train.iloc[train_index]
            df_valid = self.df_train.iloc[valid_index]

            # fit the model
            self.fit(df_train, self.features, self.target, params, num_round)

            # results of the model on validation data
            predictions = self.predict(df_valid[self.features])

            # computing the accuracy of predicted similar pictures
            accuracy = np.mean(df_valid[self.target].values == np.round(predictions))
            print 'accuracy:', accuracy
            score -= accuracy/float(len(self.kf))
            
            #score -= roc_auc_score(df_valid[self.target].values, predictions)

        print "\tScore {0}\n\n".format(score)
        return {'loss': score, 'status': STATUS_OK}



    def optimize(self, space, df_train, features, target, kf, output_file=None):
        """
        Find the best hyperparameters in the parameter space that minimize the score function.

        INPUTS:

        space       : (dict) space of parameters to explore
        df_train    : (pd.DataFrame) contains the training data: the features and the target
        features    : (string list) list of column names to use as features in the model
        target      : (string) name of the column to use as target
        kf          : (sklearn.cross_validation object) a KFold object for cross validation
        output_file : (string, optional) path of a log file to which trial losses and parameters are appended
        """
        
        self.df_train = df_train
        self.features = features
        self.target = target
        self.kf = kf
        self.output_file = output_file

        # searching for the best parameters:
        self.best = fmin(self.score, space, algo=tpe.suggest, trials=self.trials, max_evals=250)

        print self.best
        
        best = self.best.copy()
        numrounds = int(best['n_estimators'])
        del best['n_estimators']
        
        # fit an Xgboost model with the best parameters on the whole training data
        self.fit(df_train, features, target, best, numrounds)
              
        
        
    def fit(self, df, features, target, params, numrounds):
        """
        Fit an Xgboost model with parameters defined in params and numrounds
        """
        X_train = df[features]
        y_train = df[target]
        dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        self.model = xgb.train(params, dtrain,  numrounds )

    def predict(self, df):
        """
        return the output of the model for data in df
        """
        data = xgb.DMatrix(df.values)
        return self.model.predict(data)
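
A hedged sketch of a parameter space for the class above. The bounds and the binary:logistic objective are assumptions; n_estimators is popped by the class and handed to xgb.train as the number of boosting rounds, and since only it is cast to int, integer-valued xgboost parameters such as max_depth would need their own casting inside score(), so they are left out here.

from hyperopt import hp

xgb_space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),  # popped and used as num_round
    'eta': hp.loguniform('eta', -4, -1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'min_child_weight': hp.uniform('min_child_weight', 1, 10),
    'objective': 'binary:logistic',  # score() rounds probabilities, so a binary objective is assumed
}

xgb_model = XgbModel()
# df_train, features, target and kf as in the RandFModel sketch above
xgb_model.optimize(xgb_space, df_train, features, target, kf)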
Exemple #59
    def work(self, **kwargs):
        self.__dict__.update(kwargs)
        bandit = opt_q_uniform(self.target)
        prior_weight = 2.5
        gamma = 0.20
        algo = partial(tpe.suggest,
                prior_weight=prior_weight,
                n_startup_jobs=2,
                n_EI_candidates=128,
                gamma=gamma)
        #print algo.opt_idxs['x']
        #print algo.opt_vals['x']

        trials = Trials()
        fmin(passthrough,
            space=bandit.expr,
            algo=algo,
            trials=trials,
            max_evals=self.LEN)
        if self.show_vars:
            import hyperopt.plotting
            hyperopt.plotting.main_plot_vars(trials, bandit, do_show=1)

        idxs, vals = miscs_to_idxs_vals(trials.miscs)
        idxs = idxs['x']
        vals = vals['x']

        losses = trials.losses()

        from hyperopt.tpe import ap_filter_trials
        from hyperopt.tpe import adaptive_parzen_samplers

        qu = scope.quniform(1.01, 10, 1)
        fn = adaptive_parzen_samplers['quniform']
        fn_kwargs = dict(size=(4,), rng=np.random)
        s_below = pyll.Literal()
        s_above = pyll.Literal()
        b_args = [s_below, prior_weight] + qu.pos_args
        b_post = fn(*b_args, **fn_kwargs)
        a_args = [s_above, prior_weight] + qu.pos_args
        a_post = fn(*a_args, **fn_kwargs)

        #print b_post
        #print a_post
        fn_lpdf = getattr(scope, a_post.name + '_lpdf')
        print fn_lpdf
        # calculate the llik of b_post under both distributions
        a_kwargs = dict([(n, a) for n, a in a_post.named_args
                    if n not in ('rng', 'size')])
        b_kwargs = dict([(n, a) for n, a in b_post.named_args
                    if n not in ('rng', 'size')])
        below_llik = fn_lpdf(*([b_post] + b_post.pos_args), **b_kwargs)
        above_llik = fn_lpdf(*([b_post] + a_post.pos_args), **a_kwargs)
        new_node = scope.broadcast_best(b_post, below_llik, above_llik)

        print '=' * 80

        do_show = self.show_steps

        for ii in range(2, 9):
            if ii > len(idxs):
                break
            print '-' * 80
            print 'ROUND', ii
            print '-' * 80
            all_vals = [2, 3, 4, 5, 6, 7, 8, 9, 10]
            below, above = ap_filter_trials(idxs[:ii],
                    vals[:ii], idxs[:ii], losses[:ii], gamma)
            below = below.astype('int')
            above = above.astype('int')
            print 'BB0', below
            print 'BB1', above
            #print 'BELOW',  zip(range(100), np.bincount(below, minlength=11))
            #print 'ABOVE',  zip(range(100), np.bincount(above, minlength=11))
            memo = {b_post: all_vals, s_below: below, s_above: above}
            bl, al, nv = pyll.rec_eval([below_llik, above_llik, new_node],
                    memo=memo)
            #print bl - al
            print 'BB2', dict(zip(all_vals, bl - al))
            print 'BB3', dict(zip(all_vals, bl))
            print 'BB4', dict(zip(all_vals, al))
            print 'ORIG PICKED', vals[ii]
            print 'PROPER OPT PICKS:', nv

            #assert np.allclose(below, [3, 3, 9])
            #assert len(below) + len(above) == len(vals)

            if do_show:
                plt.subplot(8, 1, ii)
                #plt.scatter(all_vals,
                #    np.bincount(below, minlength=11)[2:], c='b')
                #plt.scatter(all_vals,
                #    np.bincount(above, minlength=11)[2:], c='c')
                plt.scatter(all_vals, bl, c='g')
                plt.scatter(all_vals, al, c='r')
        if do_show:
            plt.show()
            data = [X_all, y_class_tr_all, y_reg_tr_all]
            # =========================== Search the best params ===========================
            print("------------------------------------------------------------------------")
            print("-------- Search the best params for %s --------" % ftmodnm)
            starttime = time.clock()
            log_handler = log(ftmodnm)
            trial_counter = 0
            ftmodinfo = [model, data]
            trials = Trials()
            objective = lambda p: hyperopt_wrapper(p, ftmodinfo)
            best_params = fmin(objective, param, algo=tpe.suggest, trials=trials, max_evals=param["max_evals"])

            for f in modp.int_feat():
                if f in best_params:
                    best_params[f] = int(best_params[f])
            elapsed = round((time.clock() - starttime) / 60.0, 2)
            print("************************************************************")
            print("Best params for %s in %.2f min" %(ftmodnm, elapsed))
            for k, v in best_params.items():
                print("        %s: %s" % (k, v))
            trial_RMSEs = np.asarray(trials.losses(), dtype=float)
            best_RMSE_mean = min(trial_RMSEs)
            ind = np.where(trial_RMSEs == best_RMSE_mean)[0][0]
            best_RMSE_std = trials.trial_attachments(trials.trials[ind])['std']
            print("RMSE stats")
            print("        Mean: %.6f\n        Std: %.6f" % (best_RMSE_mean, best_RMSE_std))
            print("        Trial: %s" % str(ind + 1))
            print("************************************************************")
            print()