Example 1
def optimize_model_pytorch(device, args, train_GWAS, train_y, test_GWAS, test_y, out_folder="", startupJobs=40, maxevals=200, noOut=False):
    global numTrials_pytorch
    numTrials_pytorch = 0

    trials = Trials()
    trial_wrapper = partial(trial_pytorch, device=device, args=args, train_GWAS=train_GWAS, train_y=train_y, test_GWAS=test_GWAS, test_y=test_y)

    best_pars = fmin(trial_wrapper, parameter_space_pytorch(), algo=partial(tpe.suggest, n_startup_jobs=startupJobs), max_evals=maxevals, trials=trials)

    # Print the selected 'best' hyperparameters.
    if not noOut: print('\nBest hyperparameter settings: ', space_eval(parameter_space_pytorch(), best_pars), '\n')

    # loop over the hyperparameters recorded for the first trial and plot the optimization of each one
    regression = True

    for p in trials.trials[0]['misc']['idxs']: plot_optimization_pytorch(trials, p, regression, out_folder=out_folder)

    best_pars = space_eval(parameter_space_pytorch(), best_pars) # convert the returned indices into actual values from the parameter space

    # override the number of epochs with the early-stopping epoch of the best trial
    lowestLossIndex = np.argmin(trials.losses())
    best_pars['earlyStopEpochs'] = trials.trial_attachments(trials.trials[lowestLossIndex])['highestAcc_epoch']
    best_pars['earlyStopEpochs'] += 1 # as epochs are 0-based otherwise...
    best_pars['epochs'] = best_pars['earlyStopEpochs']
    if best_pars['epochs'] <= 0: best_pars['epochs'] = 1 # we don't want a network without any training, as that would cause a problem for deep dreaming
    return best_pars
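
Example 1 leans on several project-specific helpers (trial_pytorch, parameter_space_pytorch, plot_optimization_pytorch). A minimal self-contained sketch of the same Trials/fmin pattern, with a toy objective and search space standing in for those helpers (all names below are illustrative):

import numpy as np
from functools import partial
from hyperopt import fmin, tpe, hp, space_eval, Trials, STATUS_OK

def toy_objective(params):
    # stand-in for a real training run: the loss is a simple quadratic bowl
    x, y = params['x'], params['y']
    return {'loss': (x - 1.0) ** 2 + (y + 2.0) ** 2, 'status': STATUS_OK}

toy_space = {'x': hp.uniform('x', -5, 5), 'y': hp.uniform('y', -5, 5)}

trials = Trials()
best = fmin(toy_objective, toy_space,
            algo=partial(tpe.suggest, n_startup_jobs=10),  # random warm-up trials before TPE kicks in
            max_evals=50, trials=trials)
print(space_eval(toy_space, best))   # best hyperparameters as concrete values
print(min(trials.losses()))          # best observed loss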
Example 2
def run_all_dl(csvfile = saving_fp, 
                space = [hp.quniform('h1', 100, 550, 1), 
                        hp.quniform('h2', 100, 550, 1),
                        hp.quniform('h3', 100, 550, 1),
                        #hp.choice('activation', ["RectifierWithDropout", "TanhWithDropout"]),
                        hp.uniform('hdr1', 0.001, 0.3),
                        hp.uniform('hdr2', 0.001, 0.3),
                        hp.uniform('hdr3', 0.001, 0.3),
                        hp.uniform('rho', 0.9, 0.999), 
                        hp.uniform('epsilon', 1e-10, 1e-4)]):
          # maxout works well with dropout (Goodfellow et al 2013), and rectifier has worked well with image recognition (LeCun et al 1998)
          start_save(csvfile = csvfile)
          trials = Trials()
          print "Deep learning..."
          best = fmin(objective,
                      space = space,
                      algo=tpe.suggest,
                      max_evals=evals,
                      trials=trials)
          print(best)
          print(trials.losses())
          with open('output/dlbest.pkl', 'wb') as output:
            pickle.dump(best, output, -1)
          with open('output/dltrials.pkl', 'wb') as output:
            pickle.dump(trials, output, -1)
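
Because Example 2 pickles the whole Trials object, a later session can reload it to inspect the search or to resume it by passing the same object back into fmin; a sketch, assuming the pickle files written above exist (note that max_evals counts the total number of trials, including the reloaded ones):

import pickle
from hyperopt import fmin, tpe

with open('output/dltrials.pkl', 'rb') as f:
    trials = pickle.load(f)

print(len(trials.trials), 'trials so far; best loss:',
      min(l for l in trials.losses() if l is not None))

# resume the search where it stopped (objective and space as defined in Example 2)
# best = fmin(objective, space=space, algo=tpe.suggest,
#             max_evals=len(trials.trials) + 50, trials=trials)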
Example 3
def notest_opt_qn_normal(f=hp_normal):
    bandit = Bandit(
            {'loss': scope.sum([f('v%i' % ii, 0, 1)
                for ii in range(25)]) ** 2},
            loss_target=0)
    algo = TreeParzenEstimator(bandit,
            prior_weight=.5,
            n_startup_jobs=0,
            n_EI_candidates=1,
            gamma=0.15)
    trials = Trials()
    experiment = Experiment(trials, algo, async=False)
    experiment.max_queue_len = 1
    experiment.run(40)
    print 'sorted losses:', list(sorted(trials.losses()))

    idxs, vals = miscs_to_idxs_vals(trials.miscs)

    if 1:
        import hyperopt.plotting
        hyperopt.plotting.main_plot_vars(trials, bandit, do_show=1)
    else:
        import matplotlib.pyplot as plt
        begin = [v[:10] for k, v in vals.items()]
        end = [v[-10:] for k, v in vals.items()]
        plt.subplot(2, 1, 1)
        plt.title('before')
        plt.hist(np.asarray(begin).flatten())
        plt.subplot(2, 1, 2)
        plt.title('after')
        plt.hist(np.asarray(end).flatten())
        plt.show()
Example 4
 def run(self):
     start = time.time()
     trials = Trials()
     best = fmin(self._obj, self.model_param_space._build_space(), tpe.suggest, self.max_evals, trials)
     best_params = space_eval(self.model_param_space._build_space(), best)
     best_params = self.model_param_space._convert_int_param(best_params)
     trial_rmses = np.asarray(trials.losses(), dtype=float)
     best_ind = np.argmin(trial_rmses)
     best_rmse_mean = trial_rmses[best_ind]
     best_rmse_std = trials.trial_attachments(trials.trials[best_ind])["std"]
     self.logger.info("-"*50)
     self.logger.info("Best RMSE")
     self.logger.info("      Mean: %.6f"%best_rmse_mean)
     self.logger.info("      std: %.6f"%best_rmse_std)
     self.logger.info("Best param")
     self.task._print_param_dict(best_params)
     end = time.time()
     _sec = end - start
     _min = int(_sec/60.)
     self.logger.info("Time")
     if _min > 0:
         self.logger.info("      %d mins"%_min)
     else:
         self.logger.info("      %d secs"%_sec)
     self.logger.info("-"*50)
Example 5
File: run.py Project: benbo/botc
def main():

    usage = "%prog text.json labels.csv feature_dir output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_iter', default=4,
                      help='Maximum iterations of Bayesian optimization; default=%default')

    (options, args) = parser.parse_args()
    max_iter = int(options.max_iter)

    global data_filename, label_filename, feature_dir, output_dir, log_filename

    data_filename = args[0]
    label_filename = args[1]
    feature_dir = args[2]
    output_dir = args[3]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    log_filename = os.path.join(output_dir, 'log.txt')

    with open(log_filename, 'w') as logfile:
        logfile.write(','.join([data_filename, label_filename, feature_dir, output_dir]))

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
Example 6
        def work(self):
            bandit = self.bandit
            random_algo = Random(bandit)
            # build an experiment of 10 trials
            trials = Trials()
            exp = Experiment(trials, random_algo)
            #print random_algo.s_specs_idxs_vals
            exp.run(10)
            ids = trials.tids
            assert len(ids) == 10
            tpe_algo = TreeParzenEstimator(bandit)
            #print pyll.as_apply(tpe_algo.post_idxs)
            #print pyll.as_apply(tpe_algo.post_vals)
            argmemo = {}

            print trials.miscs
            idxs, vals = miscs_to_idxs_vals(trials.miscs)
            argmemo[tpe_algo.observed['idxs']] = idxs
            argmemo[tpe_algo.observed['vals']] = vals
            argmemo[tpe_algo.observed_loss['idxs']] = trials.tids
            argmemo[tpe_algo.observed_loss['vals']] = trials.losses()
            stuff = pyll.rec_eval([tpe_algo.post_below['idxs'],
                        tpe_algo.post_below['vals']],
                        memo=argmemo)
            print stuff
def main():

    usage = "%prog"
    parser = OptionParser(usage=usage)
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_chars',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space
    reuse = options.reuse
    output_dirname = options.output_dirname

    if reuse:
        output_dirname += '_reuse'

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename_wo_ext(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=100,
                trials=trials)

    print(space_eval(space, best))
    print(trials.losses())
Example 8
def optimize(obj_function, inputs, key_file, space, max_eval):

    trials = Trials()
    f = partial(obj_function, inputs, key_file)
    best = fmin(f, space=space, algo=tpe.suggest, max_evals=max_eval,
                trials=trials)
    LOGGER.info("{}\t{}".format(best, 1 - min(trials.losses())))
def optimize_model_parameter_split(x, y, model_name=None, loss_function="accuracy", parameter=None, max_evals=100, n_folds=5, isWrite=True, times=1, problem_pattern="classification"):
    """
    hyperopt model turning
    """
    if model_name == None and parameter == None:
        print "you must set parameter or model_name"
        return None
    elif parameter != None:
        param = parameter
    elif model_name != None:
        param = parameter_dictionary[model_name]
    else:
        return None

    x_trains = []
    x_tests = []
    y_trains = []
    y_tests = []

    for _ in range(times):  # '_' avoids shadowing the time module used below
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            x, y, test_size=0.0125)
        x_trains.append(x_train)
        x_tests.append(x_test)
        y_trains.append(y_train)
        y_tests.append(y_test)

    trials = Trials()
    function = lambda param: optimize_model_function_split(
        param, x_trains, x_tests, y_trains, y_tests, loss_function)
    print(param)
    print("========================================================================")
    best_param = fmin(function, param,
                      algo=tpe.suggest, max_evals=max_evals, trials=trials)
    print("========================================================================")
    print("write result to csv files")

    # write the csv file
    if isWrite:
        datas = []
        for trial_data in trials.trials:
            print(trial_data)
            trial_parameter_dictionary = {}
            trial_parameter_dictionary['model'] = model_name
            trial_parameter_dictionary['tid'] = trial_data['misc']['tid']
            for key, value in trial_data['misc']['vals'].items():
                print(key, value[0])
                trial_parameter_dictionary[key] = value[0]
            trial_parameter_dictionary['loss'] = trial_data['result']['loss']
            trial_parameter_dictionary[
                'status'] = trial_data['result']['status']
            datas.append(trial_parameter_dictionary)
        filename = str(time.time()) + ".csv"
        dictionary_in_list_convert_to_csv(datas, filename)

    print(trials.statuses())
    return best_param
Example 10
 def test_basic(self):
     bandit = self._bandit_cls()
     #print 'bandit params', bandit.params, bandit
     #print 'algo params', algo.vh.params
     trials = Trials()
     fmin(lambda x: x, bandit.expr,
          trials=trials,
          algo=suggest,
          max_evals=self._n_steps)
     assert trials.average_best_error(bandit) - bandit.loss_target  < .2
Example 11
 def test_basic(self):
     domain = self._domain_cls()
     # print 'domain params', domain.params, domain
     # print 'algo params', algo.vh.params
     trials = Trials()
     fmin(lambda x: x, domain.expr,
          trials=trials,
          algo=suggest,
          max_evals=self._n_steps)
     assert trials.average_best_error(domain) - domain.loss_target < .2
Example 12
def main():
    set_globals()
    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=max_iter,
                trials=trials)
    
    print(space_eval(space, best))
    print("losses:", [-l for l in trials.losses()])
    print('the best loss: ', max([-l for l in trials.losses()]))
    print("number of trials: " + str(len(trials.trials)))
Example 13
 def test_basic(self):
     bandit = self._bandit_cls()
     algo = Random(bandit)
     trials = Trials()
     experiment = Experiment(trials, algo, async=False)
     experiment.max_queue_len = 50
     experiment.run(self._n_steps)
     print
     print self._bandit_cls
     print bandit.loss_target
     print trials.average_best_error(bandit)
     assert trials.average_best_error(bandit) - bandit.loss_target  < .2
     print
Example 14
 def test_basic(self):
     bandit = self._bandit_cls()
     print 'bandit params', bandit.params
     algo = Random(bandit)
     print 'algo params', algo.vh.params
     trials = Trials()
     experiment = Experiment(trials, algo, async=False)
     experiment.catch_bandit_exceptions = False
     experiment.max_queue_len = 50
     experiment.run(self._n_steps)
     print
     print self._bandit_cls
     print bandit.loss_target
     print trials.average_best_error(bandit)
     assert trials.average_best_error(bandit) - bandit.loss_target  < .2
     print
Example 15
def TunningParamter(param,data,features,feature,source_name,real_value,int_boolean):
    data = data[~pd.isnull(all_data[feature])]
    print(data.shape)
    ISOTIMEFORMAT='%Y-%m-%d %X'
    start = time.strftime(ISOTIMEFORMAT, time.localtime())
    trials = Trials()
    objective = lambda p : trainModel(p, data, features, feature,source_name,real_value,int_boolean)
    
    best_parameters = fmin(objective, param, algo=tpe.suggest, max_evals=param['max_evals'], trials=trials)
    #now we need to get best_param
    trials_loss = np.asanyarray(trials.losses(),dtype=float)
    best_loss = min(trials_loss)
    ind = np.where(trials_loss==best_loss)[0][0]
    best_loss_std = trials.trial_attachments(trials.trials[ind])['std']
    end = time.strftime(ISOTIMEFORMAT,time.localtime())
    dumpMessage(best_parameters, best_loss, best_loss_std,param['task'],source_name,start,end)
    def __init__(self, train_set, holdout_set, command, max_evals=100,
                 outer_loss_function='logistic',
                 searcher='tpe', is_regression=False):
        self.train_set = train_set
        self.holdout_set = holdout_set

        self.train_model = './current.model'
        self.holdout_pred = './holdout.pred'
        self.trials_output = './trials.json'
        self.hyperopt_progress_plot = './hyperopt_progress.png'
        self.log = './log.log'

        self.logger = self._configure_logger()

        # hyperopt parameter sample, converted into a string with flags
        self.param_suffix = None
        self.train_command = None
        self.validate_command = None

        self.y_true_train = []
        self.y_true_holdout = []

        self.outer_loss_function = outer_loss_function
        self.space = self._get_space(command)
        self.max_evals = max_evals
        self.searcher = searcher
        self.is_regression = is_regression

        self.trials = Trials()
        self.current_trial = 0
Example 17
    def work(self):
        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(
            tree.suggest,
            # XXX (begin)
            n_trees=10,
            logprior_strength=1.0,
            # XXX (end)
                )
        LEN = self.LEN.get(bandit.name, 75)

        trials = Trials()
        fmin(fn=passthrough,
            space=self.bandit.expr,
            trials=trials,
            algo=algo,
            max_evals=LEN)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(fn=passthrough,
                space=self.bandit.expr,
                trials=rtrials,
                algo=rand.suggest,
                max_evals=LEN)
            print('RANDOM BEST 6:', list(sorted(rtrials.losses()))[:6])

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)

        #print trials.losses()
        print('OPT BEST 6:', list(sorted(trials.losses()))[:6])
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print('Thresh', thresh)
        assert min(trials.losses()) < thresh
 def minimize(self, restarts=2, epochs=600, tune_space=None):
     from hyperopt import fmin, tpe, Trials
     if tune_space is None:
         initial_values = self.tf_session.run(self.variables)
         tune_space = self._make_tune_space(initial_values)
     # TODO: This report structure has the downside of not writing
     # anything to disk until it's 100% complete.
     reports = []
     # Make minimize deterministic
     R = np.random.RandomState(self.seed)
      for _ in range(restarts):  # avoid rebinding the 'restarts' argument inside the loop
         trials = Trials()
         best = fmin(fn=self._evaluate,
                     space=tune_space,
                     algo=tpe.suggest,
                     max_evals=epochs,
                     trials=trials,
                     rstate=R)
         self._assign_values(best)
         reports.extend(trials.trial_attachments(t)['report'] for t in trials.trials)
     return self.evaluator.make_agg_report(reports)
Example 19
def run_all_gbm(csvfile = saving_fp, 
                space = [hp.quniform('ntrees', 200, 750, 1), hp.quniform('max_depth', 5, 15, 1), hp.uniform('learn_rate', 0.03, 0.35)]):
  # Search space is a stochastic argument-sampling program:
  start_save(csvfile = csvfile)
  trials = Trials()
  best = fmin(objective,
      space = space,
      algo=tpe.suggest,
      max_evals=evals,
      trials=trials)
  print(best)
  # from hyperopt import space_eval
  # print space_eval(space, best)
  # trials.trials # list of dictionaries representing everything about the search
  # trials.results # list of dictionaries returned by 'objective' during the search
  print(trials.losses()) # list of losses (float for each 'ok' trial)
  # trials.statuses() # list of status strings
  with open('output/gbmbest.pkl', 'wb') as output:
    pickle.dump(best, output, -1)
  with open('output/gbmtrials.pkl', 'wb') as output:
    pickle.dump(trials, output, -1)
Example 20
    def hyperopt_search(self, parallel=False):  # TODO: implement parallel search with MongoTrials
        def objective(kwargs):
            start = dt.now()
            self.get_hyperparam_string(**kwargs)
            self.fit_vw()
            self.validate_vw()
            loss = self.validation_metric_vw()

            finish = dt.now()
            elapsed = finish - start
            self.logger.info("evaluation time for this step: %s" % str(elapsed))

            # clean up
            subprocess.call(shlex.split('rm %s %s' % (self.train_model, self.holdout_pred)))

            to_return = {'status': STATUS_OK,
                         'loss': loss,  # TODO: include also train loss tracking in order to prevent overfitting
                         'eval_time': elapsed,
                         'train_command': self.train_command
                        }
            return to_return

        trials = Trials()
        if self.searcher == 'tpe':
            algo = tpe.suggest
        elif self.searcher == 'rand':
            algo = rand.suggest

        logging.debug("starting hypersearch...")
        best_params = fmin(objective, space=self.space, trials=trials, algo=algo, max_evals=self.max_evals)
        self.logger.debug("the best hyperopt parameters: %s" % str(best_params))

        best_configuration = trials.results[np.argmin(trials.losses())]['train_command']
        best_loss = trials.results[np.argmin(trials.losses())]['loss']
        self.logger.info("\n\nA FULL TRAINING COMMAND WITH THE BEST HYPERPARAMETERS: \n%s" % best_configuration)
        self.logger.info("\n\nTHE BEST LOSS VALUE: \n%s" % best_loss)

        return best_configuration, best_loss
Example 21
    def hyperopt_search(self, parallel=False):  # TODO: implement parallel search with MongoTrials
        def objective(kwargs):
            start = dt.now()

            self.current_trial += 1
            self.logger.info('\n\nStarting trial no.%d' % self.current_trial)
            self.get_hyperparam_string(**kwargs)
            self.fit_vw()
            self.validate_vw()
            loss = self.validation_metric_vw()

            finish = dt.now()
            elapsed = finish - start
            self.logger.info("evaluation time for this step: %s" % str(elapsed))

            # clean up
            subprocess.call(shlex.split('rm %s %s' % (self.train_model, self.holdout_pred)))

            to_return = {'status': STATUS_OK,
                         'loss': loss,  # TODO: include also train loss tracking in order to prevent overfitting
                         'eval_time': elapsed.seconds,
                         'train_command': self.train_command,
                         'current_trial': self.current_trial
            }
            return to_return

        self.trials = Trials()
        if self.searcher == 'tpe':
            algo = tpe.suggest
        elif self.searcher == 'rand':
            algo = rand.suggest
        else:
            raise KeyError('Invalid searcher')

        logging.debug("starting hypersearch...")
        best_params = fmin(objective, space=self.space, trials=self.trials, algo=algo, max_evals=self.max_evals)
        self.logger.debug("the best hyperopt parameters: %s" % str(best_params))

        json.dump(self.trials.results, open(self.trials_output, 'w'))
        self.logger.info('All the trials results are saved at %s' % self.trials_output)

        best_configuration = self.trials.results[np.argmin(self.trials.losses())]['train_command']
        best_loss = self.trials.results[np.argmin(self.trials.losses())]['loss']
        self.logger.info("\n\nA full training command with the best hyperparameters: \n%s\n\n" % best_configuration)
        self.logger.info("\n\nThe best holdout loss value: \n%s\n\n" % best_loss)

        return best_configuration, best_loss
Example 22
    def work(self):
        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(anneal.suggest)
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        fmin(fn=passthrough, space=self.bandit.expr, trials=trials, algo=algo, max_evals=LEN)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(fn=passthrough, space=self.bandit.expr, trials=rtrials, algo=rand.suggest, max_evals=LEN)
            print("RANDOM BEST 6:", list(sorted(rtrials.losses()))[:6])

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(list(range(LEN)), trials.losses())
            plt.title("TPE losses")
            plt.subplot(2, 2, 2)
            plt.scatter(list(range(LEN)), ([s["x"] for s in trials.specs]))
            plt.title("TPE x")
            plt.subplot(2, 2, 3)
            plt.title("RND losses")
            plt.scatter(list(range(LEN)), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title("RND x")
            plt.scatter(list(range(LEN)), ([s["x"] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist([t["x"] for t in self.experiment.trials], bins=20)

        # print trials.losses()
        print("ANNEAL BEST 6:", list(sorted(trials.losses()))[:6])
        # logx = np.log([s['x'] for s in trials.specs])
        # print 'TPE MEAN', np.mean(logx)
        # print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print("Thresh", thresh)
        assert min(trials.losses()) < thresh
Example 23
                  metrics=['accuracy'])

    model.fit(X_train,
              Y_train,
              batch_size={{choice([64, 128])}},
              nb_epoch=1,
              verbose=2,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, verbose=0)
    print('Test accuracy:', acc)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}


X_train, Y_train, X_test, Y_test = data()

best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials())

print("Evalutation of best performing model:")
print(best_model.evaluate(X_test, Y_test))
print(best_run)

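Example 23 begins mid-way through a hyperas model function: optim.minimize expects a data() function plus a model-building function whose body uses {{...}} templates for the search space. A self-contained sketch modeled on the hyperas README (the MNIST loading, layer sizes and modern Keras argument names are illustrative and are not taken from the fragment above):

from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe
from keras.datasets import mnist
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import np_utils

def data():
    # hyperas calls this once and passes the returned arrays to create_model
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784).astype('float32') / 255
    x_test = x_test.reshape(10000, 784).astype('float32') / 255
    y_train = np_utils.to_categorical(y_train, 10)
    y_test = np_utils.to_categorical(y_test, 10)
    return x_train, y_train, x_test, y_test

def create_model(x_train, y_train, x_test, y_test):
    model = Sequential()
    model.add(Dense(512, input_shape=(784,), activation='relu'))
    model.add(Dropout({{uniform(0, 1)}}))        # template resolved by hyperas, not plain Python
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size={{choice([64, 128])}},
              epochs=1, verbose=2, validation_data=(x_test, y_test))
    score, acc = model.evaluate(x_test, y_test, verbose=0)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

best_run, best_model = optim.minimize(model=create_model, data=data,
                                      algo=tpe.suggest, max_evals=5,
                                      trials=Trials())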
Example 24
    def run(self):
        self.trails = Trials()  # stored on the instance; the fmin call below reads self.trails

        if self.clf == 'Boost':
            clfdefault_value = {'Boostnestimator': 50, 'BoostLearnrate': 1}
            clfparamSpace = {
                'Boostnestimator': hp.choice('Boostnestimator',
                                             range(10, 1000)),
                'BoostLearnrate': hp.uniform('BoostLearnrate', 0.01, 10)
            }

        if self.clf == 'RF':
            clfdefault_value = {
                'n_estimators': 10,
                'criterion': 'gini',
                'max_features': 'auto',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'n_estimators':
                hp.choice('n_estimators', range(10, 100)),
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }
        if self.clf == 'BRF':
            clfdefault_value = {
                'n_estimators': 10,
                'criterion': 'gini',
                'max_features': 'auto',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'n_estimators':
                hp.choice('n_estimators', range(10, 100)),
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        if self.clf == 'SVM':
            clfdefault_value = {
                'SVCkernel': {
                    'kernel': 'poly',
                    'degree': 3,
                    'polycoef0': 0.0,
                    'polygamma': 1
                },
                'C': 1.0
            }
            clfparamSpace = {
                'SVCkernel':
                hp.choice(
                    'SVCkernel',
                    [{
                        'kernel': 'linear'
                    }, {
                        'kernel': 'poly',
                        'degree': hp.choice('degree', range(1, 5)),
                        'polycoef0': hp.uniform('polycoef0', 0, 10),
                        'polygamma': hp.choice('polygamma', ["auto", "scale"])
                    }, {
                        'kernel': 'sigmoid',
                        'sigcoef0': hp.uniform('sigcoef0', 0, 10),
                        'siggamma': hp.choice('siggamma', ["auto", "scale"])
                    }, {
                        'kernel': 'rbf',
                        'rbfgamma': hp.choice('rbfgamma', ["auto", "scale"])
                    }]),
                'C':
                hp.uniform('C', 0.001, 1000),
            }

        if self.clf == 'NN':
            clfdefault_value = {
                'NNactive': 'relu',
                'NNalpha': 0.0001,
                'NNmaxiter': 200
            }
            clfparamSpace = {
                'NNactive':
                hp.choice('NNactive',
                          ['identity', 'logistic', 'tanh', 'relu']),
                'NNalpha':
                hp.uniform('NNalpha', 1e-6, 1),
                'NNmaxiter':
                hp.choice('NNmaxiter', range(100, 1000))
            }

        if self.clf == 'KNN':
            clfdefault_value = {'KNNneighbors': 1}
            clfparamSpace = {
                'KNNneighbors': hp.choice('KNNneighbors', range(1, 50))
            }

        if self.clf == 'NB':
            clfdefault_value = {'NBType': 'gaussian'}
            clfparamSpace = {
                'NBType':
                hp.choice('NBType', ['gaussian', 'multinomial', 'bernoulli'])
            }

        if self.clf == 'Ridge':
            clfdefault_value = {'Ridgealpha': 1, 'Ridgenormalize': False}
            clfparamSpace = {
                'Ridgealpha': hp.uniform('Ridgealpha', 0.001, 1000),
                'Ridgenormalize': hp.choice('Ridgenormlize', [True, False])
            }

        if self.clf == 'CART':
            clfdefault_value = {
                'criterion': 'gini',
                'max_features': 'auto',
                'CARTsplitter': 'best',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'CARTsplitter':
                hp.choice('CARTsplitter', ['best', 'random']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        self.def_value = self.objFunc(clfdefault_value)['result']
        best = fmin(self.objFunc,
                    space=clfparamSpace,
                    algo=tpe.suggest,
                    max_evals=self.fe,
                    trials=self.trails)

        his = dict()
        his['name'] = list(self.trails.trials[0]['misc']['vals'].keys())
        i = 0
        for item in self.trails.trials:
            results = list(deepflatten(item['misc']['vals'].values()))
            results.append(item['result']['result'])
            his[i] = results
            i += 1

        inc_value = self.trails.best_trial['result']['result']

        # print(def_value)
        return np.asarray([self.def_value, inc_value]), his, best
Example 25
    def run(self):
        self.trails = Trials()  # stored on the instance; the fmin call below reads self.trails

        if self.adaptation == 'TCA':
            adptdefault_value = {
                #                 'kernel_type': 'linear',
                'dim': 5,
                'lamb': 1,
                'gamma': 1
            }
            adptparamSpace = {
                #                 'kernel_type': hp.choice('kernel_type', ['primal', 'linear', 'rbf', 'sam']),
                'dim':
                hp.choice('dim',
                          range(5, max(self.sx.shape[1], self.tx.shape[1]))),
                'lamb':
                hp.uniform('lamb', 1e-6, 1e2),
                'gamma':
                hp.uniform('gamma', 1e-5, 1e2)
            }

        if self.adaptation == 'DBSCANfilter':
            adptdefault_value = {'eps': 1, 'min_samples': 10}
            adptparamSpace = {
                'eps': hp.uniform('eps', 0.1, 1e2),
                'min_samples': hp.choice('min_samples', range(1, 100))
            }

        if self.adaptation == 'Burakfilter':
            adptdefault_value = {'n_neighbors': 10}
            adptparamSpace = {
                'n_neighbors': hp.choice('n_neighbors', range(1, 100))
            }

        if self.adaptation == 'Peterfilter':
            adptdefault_value = {'eachCluster': 10}
            adptparamSpace = {
                'eachCluster': hp.choice('eachCluster', range(1, 100))
            }

        if self.adaptation == 'Universal':
            adptdefault_value = {'pvalue': 0.05, 'QuantifyType': 'cliff'}
            adptparamSpace = {
                'pvalue': hp.uniform('pvalue', 0.01, 0.1),
                'QuantifyType': hp.choice('QuantifyType', ['cliff', 'cohen'])
            }

        if self.adaptation == 'HISNN':
            adptdefault_value = {'MinHam': 1.0, 'HISNNneighbors': 5}
            adptparamSpace = {
                'MinHam': hp.uniform('MinHam', 0.5, 100),
                'HISNNneighbors': hp.choice('HISNNneighbors', range(1, 100))
            }

        if self.adaptation == 'DTB':
            adptdefault_value = {'DTBneighbors': 10, 'DTBT': 20}
            adptparamSpace = {
                'DTBneighbors': hp.choice('DTBneighbors', range(1, 50)),
                'DTBT': hp.choice('DTBT', range(5, 30))
            }

        if self.adaptation == 'DS':
            adptdefault_value = {'DStopn': 5, 'DSfss': 0.2}
            adptparamSpace = {
                'DStopn': hp.choice('DStopn', range(1, 15)),
                'DSfss': hp.uniform('DSfss', 0.1, 0.5)
            }

        if self.adaptation == 'DSBF':
            adptdefault_value = {'DSBFtopk': 1, 'DSBFneighbors': 10}
            adptparamSpace = {
                'DSBFtopk': hp.choice('DSBFtopk', range(1, 10)),
                'DSBFneighbors': hp.choice('DSBFneighbors', range(1, 100))
            }

        self.def_value = self.objFunc(adptdefault_value)['result']
        best = fmin(self.objFunc,
                    space=adptparamSpace,
                    algo=tpe.suggest,
                    max_evals=self.fe,
                    trials=self.trails)
        his = dict()
        his['name'] = list(self.trails.trials[0]['misc']['vals'].keys())
        i = 0
        for item in self.trails.trials:
            results = list(deepflatten(item['misc']['vals'].values()))
            results.append(item['result']['result'])
            his[i] = results
            i += 1

        inc_value = self.trails.best_trial['result']['result']

        return np.asarray([self.def_value, inc_value]), his, best
    def train_model(self, config, reverse_dictionary, train_data, train_labels,
                    test_data, test_labels, l_tool_tr_samples, class_weights):
        """
        Train a model and report accuracy
        """
        # convert items to integer
        l_batch_size = list(map(int, config["batch_size"].split(",")))
        l_embedding_size = list(map(int, config["embedding_size"].split(",")))
        l_units = list(map(int, config["units"].split(",")))

        # convert items to float
        l_learning_rate = list(map(float, config["learning_rate"].split(",")))
        l_dropout = list(map(float, config["dropout"].split(",")))
        l_spatial_dropout = list(
            map(float, config["spatial_dropout"].split(",")))
        l_recurrent_dropout = list(
            map(float, config["recurrent_dropout"].split(",")))

        optimize_n_epochs = int(config["optimize_n_epochs"])

        # get dimensions
        dimensions = len(reverse_dictionary) + 1
        best_model_params = dict()
        early_stopping = EarlyStopping(monitor='val_loss',
                                       mode='min',
                                       verbose=1,
                                       min_delta=1e-1,
                                       restore_best_weights=True)

        # specify the search space for finding the best combination of parameters using Bayesian optimisation
        params = {
            "embedding_size":
            hp.quniform("embedding_size", l_embedding_size[0],
                        l_embedding_size[1], 1),
            "units":
            hp.quniform("units", l_units[0], l_units[1], 1),
            "batch_size":
            hp.quniform("batch_size", l_batch_size[0], l_batch_size[1], 1),
            "learning_rate":
            hp.loguniform("learning_rate", np.log(l_learning_rate[0]),
                          np.log(l_learning_rate[1])),
            "dropout":
            hp.uniform("dropout", l_dropout[0], l_dropout[1]),
            "spatial_dropout":
            hp.uniform("spatial_dropout", l_spatial_dropout[0],
                       l_spatial_dropout[1]),
            "recurrent_dropout":
            hp.uniform("recurrent_dropout", l_recurrent_dropout[0],
                       l_recurrent_dropout[1])
        }

        def create_model(params):
            model = Sequential()
            model.add(
                Embedding(dimensions,
                          int(params["embedding_size"]),
                          mask_zero=True))
            model.add(SpatialDropout1D(params["spatial_dropout"]))
            model.add(
                GRU(int(params["units"]),
                    dropout=params["dropout"],
                    recurrent_dropout=params["recurrent_dropout"],
                    return_sequences=True,
                    activation="elu"))
            model.add(Dropout(params["dropout"]))
            model.add(
                GRU(int(params["units"]),
                    dropout=params["dropout"],
                    recurrent_dropout=params["recurrent_dropout"],
                    return_sequences=False,
                    activation="elu"))
            model.add(Dropout(params["dropout"]))
            model.add(Dense(2 * dimensions, activation="sigmoid"))
            optimizer_rms = RMSprop(lr=params["learning_rate"])
            batch_size = int(params["batch_size"])
            model.compile(loss=utils.weighted_loss(class_weights),
                          optimizer=optimizer_rms)
            print(model.summary())
            model_fit = model.fit_generator(
                utils.balanced_sample_generator(train_data, train_labels,
                                                batch_size, l_tool_tr_samples),
                steps_per_epoch=len(train_data) // batch_size,
                epochs=optimize_n_epochs,
                callbacks=[early_stopping],
                validation_data=(test_data, test_labels),
                verbose=2,
                shuffle=True)
            return {
                'loss': model_fit.history["val_loss"][-1],
                'status': STATUS_OK,
                'model': model
            }

        # minimize the objective function using the set of parameters above
        trials = Trials()
        learned_params = fmin(create_model,
                              params,
                              trials=trials,
                              algo=tpe.suggest,
                              max_evals=int(config["max_evals"]))
        best_model = trials.results[np.argmin(
            [r['loss'] for r in trials.results])]['model']
        # set the best params with respective values
        for item in learned_params:
            item_val = learned_params[item]
            best_model_params[item] = item_val
        return best_model_params, best_model
Example 27
    def run(self):
        # stored on the instance; the three fmin calls below read self.Atrails, self.Ctrails and self.trails
        self.Atrails = Trials()
        self.Ctrails = Trials()
        self.trails = Trials()

        if self.imb == 'SMOTE':
            imbdefault_value = {'SMOTE_k_neighbors': 5}
            imbparamSpace = {
                'SMOTE_k_neighbors': hp.choice('SMOTE_k_neighbors',
                                               range(1, 50))
            }
        if self.imb == 'ENN':
            imbdefault_value = {'ENN_n_neighbors': 3}
            imbparamSpace = {
                'ENN_n_neighbors': hp.choice('ENN_n_neighbors', range(1, 50)),
                'kind_sel': hp.choice('kind_sel', ['all', 'mode'])
            }

        if self.adaptation == 'TCA':
            adptdefault_value = {
                #                 'kernel_type': 'linear',
                'dim': 5,
                'lamb': 1,
                'gamma': 1
            }
            adptparamSpace = {
                #                 'kernel_type': hp.choice('kernel_type', ['primal', 'linear', 'rbf', 'sam']),
                'dim':
                hp.choice('dim',
                          range(5, max(self.sx.shape[1], self.tx.shape[1]))),
                'lamb':
                hp.uniform('lamb', 1e-6, 1e2),
                'gamma':
                hp.uniform('gamma', 1e-5, 1e2)
            }

        if self.adaptation == 'DBSCANfilter':
            adptdefault_value = {'eps': 1, 'min_samples': 10}
            adptparamSpace = {
                'eps': hp.uniform('eps', 0.1, 1e2),
                'min_samples': hp.choice('min_samples', range(1, 100))
            }

        if self.adaptation == 'Burakfilter':
            adptdefault_value = {'n_neighbors': 10}
            adptparamSpace = {
                'n_neighbors': hp.choice('n_neighbors', range(1, 100))
            }

        if self.adaptation == 'Peterfilter':
            adptdefault_value = {'eachCluster': 10}
            adptparamSpace = {
                'eachCluster': hp.choice('eachCluster', range(1, 100))
            }

        if self.adaptation == 'Universal':
            adptdefault_value = {'pvalue': 0.05, 'QuantifyType': 'cliff'}
            adptparamSpace = {
                'pvalue': hp.uniform('pvalue', 0.01, 0.1),
                'QuantifyType': hp.choice('QuantifyType', ['cliff', 'cohen'])
            }

        if self.adaptation == 'DTB':
            adptdefault_value = {'DTBneighbors': 10, 'DTBT': 20}
            adptparamSpace = {
                'DTBneighbors': hp.choice('DTBneighbors', range(1, 50)),
                'DTBT': hp.choice('DTBT', range(5, 30))
            }

        if self.adaptation == 'DS':
            adptdefault_value = {'DStopn': 5, 'DSfss': 0.2}
            adptparamSpace = {
                'DStopn': hp.choice('DStopn', range(1, 15)),
                'DSfss': hp.uniform('DSfss', 0.1, 0.5)
            }

        if self.adaptation == 'DSBF':
            adptdefault_value = {'DSBFtopk': 1, 'DSBFneighbors': 10}
            adptparamSpace = {
                'DSBFtopk': hp.choice('DSBFtopk', range(1, 10)),
                'DSBFneighbors': hp.choice('DSBFneighbors', range(1, 100))
            }

        if self.clf == 'Boost':
            clfdefault_value = {'Boostnestimator': 50, 'BoostLearnrate': 1}
            clfparamSpace = {
                'Boostnestimator': hp.choice('Boostnestimator',
                                             range(10, 1000)),
                'BoostLearnrate': hp.uniform('BoostLearnrate', 0.01, 10)
            }

        if self.clf == 'RF':
            clfdefault_value = {
                'n_estimators': 10,
                'criterion': 'gini',
                'max_features': 'auto',
                'RFmin_samples_split': 2,
                'RFclass_weight': None
            }
            clfparamSpace = {
                'n_estimators':
                hp.choice('n_estimators', range(10, 100)),
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10))),
                'RFclass_weight':
                hp.choice('RFclass_weight',
                          ['balanced', 'balanced_subsample', None])
            }

        if self.clf == 'BRF':
            clfdefault_value = {
                'n_estimators': 10,
                'criterion': 'gini',
                'max_features': 'auto',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'n_estimators':
                hp.choice('n_estimators', range(10, 100)),
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        if self.clf == 'SVM':
            clfdefault_value = {
                'SVCkernel': {
                    'kernel': 'poly',
                    'degree': 3,
                    'polycoef0': 0.0,
                    'polygamma': 1
                },
                'C': 1.0
            }
            clfparamSpace = {
                'SVCkernel':
                hp.choice(
                    'SVCkernel',
                    [{
                        'kernel': 'linear'
                    }, {
                        'kernel': 'poly',
                        'degree': hp.choice('degree', range(1, 5)),
                        'polycoef0': hp.uniform('polycoef0', 0, 10),
                        'polygamma': hp.choice('polygamma', ["auto", "scale"])
                    }, {
                        'kernel': 'sigmoid',
                        'sigcoef0': hp.uniform('sigcoef0', 0, 10),
                        'siggamma': hp.choice('siggamma', ["auto", "scale"])
                    }, {
                        'kernel': 'rbf',
                        'rbfgamma': hp.choice('rbfgamma', ["auto", "scale"])
                    }]),
                'C':
                hp.uniform('C', 0.001, 1000),
            }

        if self.clf == 'NN':
            clfdefault_value = {
                'NNactive': 'relu',
                'NNalpha': 0.0001,
                'NNmaxiter': 200
            }
            clfparamSpace = {
                'NNactive':
                hp.choice('NNactive',
                          ['identity', 'logistic', 'tanh', 'relu']),
                'NNalpha':
                hp.uniform('NNalpha', 1e-6, 1),
                'NNmaxiter':
                hp.choice('NNmaxiter', range(100, 1000))
            }

        if self.clf == 'KNN':
            clfdefault_value = {'KNNneighbors': 1}
            clfparamSpace = {
                'KNNneighbors': hp.choice('KNNneighbors', range(1, 50))
            }

        if self.clf == 'NB':
            clfdefault_value = {'NBType': 'gaussian'}
            clfparamSpace = {
                'NBType':
                hp.choice('NBType', ['gaussian', 'multinomial', 'bernoulli'])
            }

        if self.clf == 'Ridge':
            clfdefault_value = {'Ridgealpha': 1, 'Ridgenormalize': False}
            clfparamSpace = {
                'Ridgealpha': hp.uniform('Ridgealpha', 0.001, 1000),
                'Ridgenormalize': hp.choice('Ridgenormlize', [True, False])
            }

        if self.clf == 'CART':
            clfdefault_value = {
                'criterion': 'gini',
                'max_features': 'auto',
                'CARTsplitter': 'best',
                'RFmin_samples_split': 2
            }
            clfparamSpace = {
                'criterion':
                hp.choice('criterion', ['gini', 'entropy']),
                'max_features':
                hp.choice('max_features', ['auto', 'sqrt', 'log2']),
                'CARTsplitter':
                hp.choice('CARTsplitter', ['best', 'random']),
                'RFmin_samples_split':
                hp.choice('RFmin_samples_split',
                          range(2, int(len(self.sy) / 10)))
            }

        if self.imb == 'NoImb':
            default_value = dict(adptdefault_value, **clfdefault_value)
        else:
            default_value = dict(adptdefault_value, **clfdefault_value,
                                 **imbdefault_value)

        self.def_value = self.objFunc(default_value)['result']
        self.Imbbest = fmin(self.objFunc,
                            space=imbparamSpace,
                            algo=tpe.suggest,
                            max_evals=int(self.fe * 0.33),
                            trials=self.Atrails)
        self.Imbbest = space_eval(imbparamSpace, self.Imbbest)

        his = dict()
        try:
            his['name'] = list(self.Atrails.trials[0]['misc']['vals'].keys()) + list(adptdefault_value.keys()) \
                          + list(clfdefault_value.keys())
        except:
            his['name'] = [None]
        i = 0
        for item in self.Atrails.trials:
            if item['state'] == 2:
                results = list(deepflatten(item['misc']['vals'].values())) + list(adptdefault_value.values()) \
                          + list(clfdefault_value.values())
                results.append(item['result']['result'])
                his[i] = results
                i += 1

        # optimize the parameters of the adaptation method
        self.SEQ = 1
        self.Adptbest = fmin(self.objFunc,
                             space=adptparamSpace,
                             algo=tpe.suggest,
                             max_evals=int(self.fe * 0.33),
                             trials=self.Ctrails)

        try:
            his['name1'] = list(self.Imbbest.keys()) + list(self.Ctrails.trials[0]['misc']['vals'].keys()) \
                           + list(clfdefault_value.keys())
        except:
            his['name1'] = [None]
        for item in self.Ctrails.trials:
            if item['state'] == 2:
                results = list(self.Imbbest.values()) + list(deepflatten(item['misc']['vals'].values())) \
                          + list(clfdefault_value.values())
                results.append(item['result']['result'])
                his[i] = results
                i += 1

        # optimize the parameters of the classifier
        self.SEQ = 2
        Clfbest = fmin(self.objFunc,
                       space=clfparamSpace,
                       algo=tpe.suggest,
                       max_evals=int(self.fe * 0.33),
                       trials=self.trails)

        try:
            his['name2'] = list(self.Imbbest.keys()) + list(
                self.Adptbest.keys()) + list(
                    self.trails.trials[0]['misc']['vals'].keys())
        except:
            his['name2'] = [None]
        for item in self.trails.trials:
            if item['state'] == 2:
                results = list(self.Imbbest.values()) + list(
                    self.Adptbest.values()) + list(
                        deepflatten(item['misc']['vals'].values()))
                results.append(item['result']['result'])
                his[i] = results
                i += 1

        inc_value = self.trails.best_trial['result']['result']

        return np.asarray([self.def_value, inc_value]), his, Clfbest
Example 28
    def get_params(self, a=1):
        if self.obj == 'binary':
            space = {
                'bagging_freq':
                hp.choice('bagging_freq', np.arange(1, 5, dtype=int)),
                'bagging_fraction':
                hp.uniform('bagging_fraction', 0.20, 0.90),
                'boost':
                hp.choice('boost', ['gbdt']),
                'feature_fraction':
                hp.uniform('feature_fraction', 0.20, 0.90),
                'learning_rate':
                hp.loguniform('learning_rate', np.log(0.0070), np.log(0.010)),
                'min_data_in_leaf':
                hp.choice('min_data_in_leaf', np.arange(50, 90, dtype=int)),
                'num_leaves':
                hp.choice('num_leaves', np.arange(5, 35, dtype=int)),
                'min_sum_hessian_in_leaf':
                hp.choice('min_sum_hessian_in_leaf', np.arange(5,
                                                               35,
                                                               dtype=int)),
                'max_depth':
                hp.choice('max_depth', np.arange(-2, 2, dtype=int)),
                'tree_learner':
                hp.choice('tree_learner', ['serial']),
                'objective':
                hp.choice('objective', ['binary']),
                'boost_from_average':
                hp.choice('boost_from_average', [False]),
                'num_threads':
                hp.choice('num_threads', np.arange(8, 9, dtype=int)),
                'verbosity':
                hp.choice('verbosity', np.arange(1, 2, dtype=int))
            }

        else:
            space = {
                'num_leaves':
                hp.choice('num_leaves', np.arange(5, 35, dtype=int)),
                'learning_rate':
                hp.loguniform('learning_rate', np.log(0.0070), np.log(0.010)),
                'max_depth':
                hp.choice('max_depth', np.arange(-2, 2, dtype=int)),
                'colsample_bytree':
                hp.uniform('colsample_bytree', 0.1, 0.9)
            }

        new_tpe = tpe.suggest
        new_trial = Trials()

        global ITERATION
        ITERATION = 0

        best = fmin(fn=self.objective,
                    space=space,
                    algo=new_tpe,
                    max_evals=self.m_eval,
                    trials=new_trial,
                    rstate=np.random.RandomState(50))

        bayes_trials_results = sorted(new_trial.results,
                                      key=lambda x: x['loss'])
        params = bayes_trials_results[:1]
        print('*' * 40)
        print('Best Params\n:', bayes_trials_results[:1])

        params = params[0]['params']

        return params
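
Example 28 sorts new_trial.results by 'loss' and then reads params[0]['params'], so the objective it minimizes has to return both keys in its result dict. A sketch of that shape, using a hypothetical cross_validate_lgbm helper (only the returned dictionary structure matters here):

from hyperopt import STATUS_OK

def objective(params):
    global ITERATION
    ITERATION += 1
    # cross_validate_lgbm is a hypothetical helper that trains LightGBM with
    # `params` and returns a validation loss to be minimized
    loss = cross_validate_lgbm(params)
    # echoing params back is what lets get_params() read result['params'] later
    return {'loss': loss, 'params': params, 'iteration': ITERATION, 'status': STATUS_OK}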
Example 29
def run_hyperopt(major_model,
                 specific_model,
                 training_params,
                 sampled_x,
                 sampled_y,
                 x_to_predict,
                 eval_metric=mse,
                 train_test_inds=None,
                 hyperopt_rounds=100):
    """
    Executes hyperparameter optimization for a given inbuilt model
    
    Parameters
    ----------
    major_model: str
        Choice of 'Keras', 'XGB', or 'sklearn-regressor'. This argument
        tells MldeModel from which package we will be pulling models. 
    specific_model: str
        This argument tells MldeModel which regressor to use within the package
        defined by major_model.    
    training_params: dict
        These are parameters required for training the models specified by 
        'major_model' and 'specific_model'. Details on the requirements for each
        submodel can be found in the online documentation.
    sampled_x: numpy array
        Training features
    sampled_y: numpy array, 1D
        Training labels
    x_to_predict: numpy array
        Features for which we want to predict labels
    eval_metric: func
        The function used for evaluating cross-validation error. This metric will
        be used to rank model architectures from best to worst. The function must
        take the form 'function(real_values, predicted_values)'.
    train_test_inds: list of lists
        Cross validation indices to use in training.
    hyperopt_rounds: int
        Number of rounds of hyperparameter optimization to perform
        
    Returns
    -------
    all_trial_info: pd.DataFrame
        The results of "process_trials" post hyperparameter optimization
    train_pred_results: tuple
        The results of MLDE.Support.RunMlde.TrainAndPredict.train_and_predict()
        using the best parameters identified during hyperparameter optimization
    """
    # Get the shape of x
    x_shape = sampled_x.shape

    # Build the search space
    space_var_names = list(space_by_model[major_model][specific_model])
    search_space = [
        search_spaces[major_model][space_var] for space_var in space_var_names
    ]

    # Construct a dictionary for passing in kwargs to Optimize
    optimizer_kwargs = {
        "space_names": space_var_names,
        "x": sampled_x,
        "y": sampled_y,
        "major_model": major_model,
        "specific_model": specific_model,
        "training_params": training_params,
        "eval_metric": eval_metric,
        "train_test_inds": train_test_inds
    }

    # Construct the optimizer function
    complete_optimizer = partial(optimize, **optimizer_kwargs)

    # Build the trials object
    trials = Trials()

    # Run hyperparameter optimization
    best_params = fmin(complete_optimizer,
                       space=search_space,
                       algo=tpe.suggest,
                       max_evals=hyperopt_rounds,
                       trials=trials,
                       show_progressbar=False)

    # Reformat best_params to have the correct datatypes
    best_params = process_best(best_params, major_model, specific_model,
                               x_shape)

    # Process the trials
    all_trial_info = process_trials(trials, major_model, specific_model)

    # Now build the model using the best parameters
    best_model = MldeModel(major_model,
                           specific_model,
                           model_params=best_params,
                           training_params=training_params,
                           eval_metric=eval_metric)

    # Train and predict using the best model
    train_pred_results = train_and_predict(best_model,
                                           sampled_x=sampled_x,
                                           sampled_y=sampled_y,
                                           x_to_predict=x_to_predict,
                                           train_test_inds=train_test_inds)

    # Return all relevant information
    return all_trial_info, train_pred_results
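
A hypothetical call sketch for the function above; the model identifiers, the empty training_params, and the random arrays are placeholders, not values taken from the MLDE documentation:

import numpy as np

# Placeholder data: 100 labelled points with 32 features, 20 points to predict.
sampled_x = np.random.rand(100, 32)
sampled_y = np.random.rand(100)
x_to_predict = np.random.rand(20, 32)

all_trial_info, train_pred_results = run_hyperopt(
    major_model="XGB",            # placeholder choice of package
    specific_model="XGBTree",     # placeholder regressor name
    training_params={},           # package-specific; left empty in this sketch
    sampled_x=sampled_x,
    sampled_y=sampled_y,
    x_to_predict=x_to_predict,
    hyperopt_rounds=10)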
Esempio n. 30
0
    def work(self):

        bandit = self.bandit
        assert bandit.name is not None
        algo = partial(tpe.suggest,
                gamma=self.gammas.get(bandit.name,
                    tpe._default_gamma),
                prior_weight=self.prior_weights.get(bandit.name,
                    tpe._default_prior_weight),
                n_EI_candidates=self.n_EIs.get(bandit.name,
                    tpe._default_n_EI_candidates),
                )
        LEN = self.LEN.get(bandit.name, 50)

        trials = Trials()
        fmin(passthrough,
            space=bandit.expr,
            algo=algo,
            trials=trials,
            max_evals=LEN,
            rstate=np.random.RandomState(123),
            catch_eval_exceptions=False)
        assert len(trials) == LEN

        if 1:
            rtrials = Trials()
            fmin(passthrough,
                space=bandit.expr,
                algo=rand.suggest,
                trials=rtrials,
                max_evals=LEN)
            print 'RANDOM MINS', list(sorted(rtrials.losses()))[:6]
            #logx = np.log([s['x'] for s in rtrials.specs])
            #print 'RND MEAN', np.mean(logx)
            #print 'RND STD ', np.std(logx)

        if 0:
            plt.subplot(2, 2, 1)
            plt.scatter(range(LEN), trials.losses())
            plt.title('TPE losses')
            plt.subplot(2, 2, 2)
            plt.scatter(range(LEN), ([s['x'] for s in trials.specs]))
            plt.title('TPE x')
            plt.subplot(2, 2, 3)
            plt.title('RND losses')
            plt.scatter(range(LEN), rtrials.losses())
            plt.subplot(2, 2, 4)
            plt.title('RND x')
            plt.scatter(range(LEN), ([s['x'] for s in rtrials.specs]))
            plt.show()
        if 0:
            plt.hist(
                    [t['x'] for t in self.experiment.trials],
                    bins=20)

        #print trials.losses()
        print 'TPE    MINS', list(sorted(trials.losses()))[:6]
        #logx = np.log([s['x'] for s in trials.specs])
        #print 'TPE MEAN', np.mean(logx)
        #print 'TPE STD ', np.std(logx)
        thresh = self.thresholds[bandit.name]
        print 'Thresh', thresh
        assert min(trials.losses()) < thresh
Esempio n. 31
0
def main():
    n_folds = 5
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [
            'window_size=', 'wiki=', 'n_feature_maps=', 'epochs=',
            'undersample=', 'criterion=', 'optimizer=',
            'model=', 'genia=', 'tacc=', 'layers=', 'hyperopt=', 'model_name='
        ])
    except getopt.GetoptError as error:
        print error
        sys.exit(2)
    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2
    use_genia = False
    using_tacc = False
    layer_sizes = []
    hyperopt = False
    model_name = 'model'

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            if int(arg) == 0:
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--layers':
            layer_sizes = arg.split(',')
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            option = int(arg)

            if option == 1:
                undersample = True

        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        elif opt == '--genia':
            if int(arg) == 1:
                use_genia = True
        elif opt == '--tacc':
            if int(arg) == 1:
                using_tacc = True
        elif opt == '--hyperopt':
            if int(arg) == 1:
                hyperopt = True
        elif opt == '--model_name':
            model_name = arg
        else:
            print "Option {} is not valid!".format(opt)

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')

    if wiki:
        print 'Using wiki word2vec...'
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print 'Using non-wiki word2vec...'
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True, use_genia=use_genia, using_tacc=using_tacc)
    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    accuracies = []
    recalls = []
    precisions = []
    f1_scores = []
    aucs = []

    global model

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print train_pmids
        print('loading data...')

        if model_type == 'cnn':
            X_train, y_train = _prep_data(train_pmids,
                                          pmids_dict,
                                          w2v,
                                          window_size,
                                          model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids,
                                        pmids_dict,
                                        w2v,
                                        window_size,
                                        model_type,
                                        binary_ce=binary_cross_entropy)
        elif model_type == 'nn':
            X_train, y_train = _prep_data(train_pmids,
                                          pmids_dict,
                                          w2v,
                                          window_size,
                                          model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids,
                                        pmids_dict,
                                        w2v,
                                        window_size,
                                        model_type,
                                        binary_ce=binary_cross_entropy)
        elif model_type == 'ladder':
            X_train, y_train = _prep_data(train_pmids,
                                          pmids_dict,
                                          w2v,
                                          window_size,
                                          model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids,
                                        pmids_dict,
                                        w2v,
                                        window_size,
                                        model_type,
                                        binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non group tags at random....probably a bad idea...
            if binary_cross_entropy:
                idx_undersample = numpy.where(y_train == 0)[0]
                idx_postive = numpy.where(y_train == 1)[0]
            else:
                idx_undersample = numpy.where(y_train[:, 1] == 0)[0]
                idx_postive = numpy.where(y_train[:, 1] == 1)[0]
            random_negative_sample = numpy.random.choice(
                idx_undersample, idx_postive.shape[0])

            if model_type == 'nn':
                X_train_postive = X_train[idx_postive, :]
                X_train_negative = X_train[random_negative_sample, :]
            else:
                X_train_postive = X_train[idx_postive, :, :, :]

                X_train_negative = X_train[random_negative_sample, :, :, :]

            if binary_cross_entropy:
                y_train_postive = y_train[idx_postive]
                y_train_negative = y_train[random_negative_sample]
            else:
                y_train_postive = y_train[idx_postive, :]
                y_train_negative = y_train[random_negative_sample, :]

            X_train = numpy.vstack((X_train_postive, X_train_negative))

            if binary_cross_entropy:
                y_train = numpy.hstack((y_train_postive, y_train_negative))

            else:
                y_train = numpy.vstack((y_train_postive, y_train_negative))

        print('loaded data...')

        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size,
                             n_feature_maps=n_feature_maps,
                             k_output=k,
                             name=model_name)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size,
                            k=k,
                            hyperparameter_search=hyperopt,
                            name=model_name)

        if hyperopt:
            best_run, best_model = optim.minimize(model=_model,
                                                  data=_data,
                                                  algo=tpe.suggest,
                                                  max_evals=5,
                                                  trials=Trials())
            model.model = best_model

        else:
            model.train(X_train,
                        y_train,
                        epochs,
                        optim_algo=optimizer,
                        criterion=criterion)

        words = []
        for pmid in test_pmids:
            words.extend(pmids_dict[pmid][0])

        predictions = model.predict_classes(X_test)

        predicted_words = crf.output2words(predictions, words)
        y_test_arg_max = numpy.argmax(y_test, axis=1)
        true_words = crf.output2words(y_test_arg_max, words)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)
        recall, precision, f1_score = crf.eveluate(predicted_words, true_words)

        print "Accuracy: {}".format(accuracy)
        print "F1: {}".format(f1_score)
        print "Precision: {}".format(precision)
        print "AUC: {}".format(auc)
        print "Recall: {}".format(recall)

        accuracies.append(accuracy)
        f1_scores.append(f1_score)
        precisions.append(precision)
        aucs.append(auc)
        recalls.append(recall)
    mean_accuracy = numpy.mean(accuracies)
    mean_f1_score = numpy.mean(f1_scores)
    mean_precision = numpy.mean(precisions)
    mean_auc_score = numpy.mean(aucs)
    mean_recall = numpy.mean(recalls)

    mean_accuracy_string = "Mean Accuracy: {}".format(mean_accuracy)
    mean_f1_score_string = "Mean F1: {}".format(mean_f1_score)
    mean_precision_string = "Mean Precision: {}".format(mean_precision)
    mean_auc_score_string = "Mean AUC: {}".format(mean_auc_score)
    mean_recall_string = "Mean Recall: {}".format(mean_recall)

    print mean_accuracy_string
    print mean_f1_score_string
    print mean_precision_string
    print mean_auc_score_string
    print mean_recall_string

    results = open('{}_fold_results'.format(model.model_name), 'w+')
    results.write(mean_accuracy_string + '\n')
    results.write(mean_f1_score_string + '\n')
    results.write(mean_precision_string + '\n')
    results.write(mean_auc_score_string + '\n')
    results.write(mean_recall_string + '\n')
    results.close()
Esempio n. 32
0
        log_handler = open(log_file, 'w')
        writer = csv.writer(log_handler)
        headers = [
            'trial_counter', 'log_loss_mean', 'log_loss_std', 'spend_time'
        ]
        for k, v in sorted(param_space.items()):
            headers.append(k)
        print(headers)
        writer.writerow(headers)
        log_handler.flush()

        print("************************************************************")
        print("Search for the best params")
        # global trial_counter
        trial_counter = 0
        trials = Trials()
        # the lambda is not executed here; it only defines the objective function
        objective = lambda p: hyperopt_wrapper(p, feat_folder, feat_name)
        # objective is passed into fmin, which calls it with each sampled parameter set
        best_params = fmin(objective,
                           param_space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=param_space["max_evals"])
        for f in int_feat:
            if f in best_params:
                best_params[f] = int(best_params[f])
        print("************************************************************")
        print("Best params")
        for k, v in best_params.items():
            print("        %s: %s" % (k, v))
Esempio n. 33
0
        'objective':
        'reg:linear',
        'booster':
        'gbtree',
        'silent':
        1,
        'seed':
        random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest, trials=trials,
                max_evals=25)  #more evals
    return best


trials = Trials()  #store history of search
best = optimize(trials)
print("The best hyperparameters are: ", "\n")
print(best)
sys.stdout.flush()

#add here: train final model
print('training model')

rf1 = MultiOutputRegressor(
    XGBRegressor(max_depth=best['max_depth'],
                 n_estimators=int(best['n_estimators']),
                 random_state=123,
                 n_jobs=-1,
                 silent=False,
                 colsample_bytree=best['colsample_bytree'],
Esempio n. 34
0
def run_hyperopt_classification(
    model_name,
    model_space,
    x_train,
    y_train,
    scoring="f1",
    cv=3,
    max_evals=20,
    verbose=False,
    persistIterations=True,
):
    print("LABDAPS --- Running Hyperopt Bayesian Optimization")
    print("reloaded")

    import pandas as pd
    import time
    import datetime
    from hyperopt import fmin, tpe, Trials, space_eval
    from sklearn.model_selection import cross_val_score

    def objective(space):
        ### MODEL SELECTION

        if model_name == "lr":
            # logistic regression
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(**space)

        elif model_name == "rf":
            # print("Setting model as RandomForestClassifier")
            from sklearn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(**space, n_jobs=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "xgb":
            # print("Setting model as XGBClassifier")
            from xgboost import XGBClassifier

            model = XGBClassifier(**space,
                                  objective="binary:logistic",
                                  nthread=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "dt":
            # print("Setting model as DecisionTreeClassifier")
            from sklearn.tree import DecisionTreeClassifier

            model = DecisionTreeClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "catboost":
            # print("Setting model as CatBoost")
            from catboost import CatBoostClassifier

            model = CatBoostClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "extratrees":
            # print("Setting model as CatBoost")
            from sklearn.ensemble import ExtraTreesClassifier

            model = ExtraTreesClassifier(**space, n_jobs=-1)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "svc":
            from sklearn.svm import SVC

            model = SVC(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "ann":
            # print("Setting model as ANN")
            from sklearn import neural_network

            model = neural_network.MLPClassifier(**space)
            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "lgb":
            import lightgbm as lgb

            model = lgb.LGBMClassifier(**space, n_jobs=-1, random_state=42)

            if verbose:
                print("Hyperparameters: ", space)

        elif model_name == "knn":
            from sklearn.neighbors import KNeighborsClassifier

            model = KNeighborsClassifier(**space)

            if verbose:
                print("Hyperparameters: ", space)

        else:
            # print("ERRO: Especifique um nome valido para model_name: rf, xgb, dt ou catboost")
            raise Exception(
                "Invalid model_name - Please specify one of the supported model_name: rf, xgb, ann, dt, svc, lgr, knn or catboost"
            )
        score = cross_val_score(model,
                                x_train,
                                y_train,
                                cv=cv,
                                scoring=scoring,
                                verbose=False,
                                n_jobs=-1).mean()
        score = 1 - score  # convert to a loss for the minimization routine

        return score

    start = time.time()
    trials = Trials()
    best = fmin(
        objective,
        space=model_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
    )

    if persistIterations:
        # Save the hyperparameter at each iteration to a csv file
        param_values = [x["misc"]["vals"] for x in trials.trials]
        param_values = [{key: value
                         for key in x for value in x[key]}
                        for x in param_values]
        param_values = [space_eval(model_space, x) for x in param_values]

        param_df = pd.DataFrame(param_values)
        param_df[scoring] = [1 - x for x in trials.losses()]
        param_df.index.name = "Iteration"
        ts = time.time()
        st = datetime.datetime.fromtimestamp(ts).strftime("%Y%m%d-%H:%M")
        param_df.to_csv(f"Hyperopt_{model_name}_{st}.csv")
        print(f"Arquivo Hyperopt_{model_name}_{st}.csv gerado com sucesso.")

    print(f"Hyperopt search took %.2f seconds for {max_evals} candidates" %
          ((time.time() - start)))
    # print(-best_score, best)
    print("** Best Hyperparameters are: **")
    print(space_eval(model_space, best))
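
A hypothetical call to the function above, with x_train and y_train already in memory and a small illustrative random-forest space (the space keys below are placeholders, not the author's):

from hyperopt import hp

rf_space = {
    "n_estimators": hp.choice("n_estimators", [100, 300, 500]),
    "max_depth": hp.choice("max_depth", [None, 5, 10]),
}
run_hyperopt_classification("rf", rf_space, x_train, y_train,
                            scoring="f1", max_evals=20, verbose=True)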
Esempio n. 35
0
        print('Best validation acc of epoch:', validation_acc)
        return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':
    # config TF-----------------------------------------------------------------------------------------------------
    CUDA, max_eval = sys.argv[1:]
    os.environ['CUDA_VISIBLE_DEVICES'] = CUDA
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # x_train, y_train, x_test, y_test = data()
    # Conv2DClassifierIn1(x_train, y_train, x_test, y_test)


    trials = Trials()
    best_run, best_model = optim.minimize(model=Conv2DClassifierIn1,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=int(max_eval),
                                          keep_temp=False,
                                          trials=trials)
    for trial in trials:
        print(trial)
    X_train, Y_train, X_test, Y_test = data()
    print("Evalutation of best performing model:")
    print(best_model.evaluate(X_test, Y_test))
    print("Best performing model chosen hyper-parameters:")
    print(best_run)
Esempio n. 36
0
    coords = get_gene_coords(geneannot, gene)
    gene_name = get_gene_name(geneannot, gene)
    expr_vec = expr_df[gene]

    adj_exp = adjust_for_covariates(list(expr_vec), cov)
    cis_gt = get_cis_genotype(gt_df, snpannot, coords)

    #build the model

    if (type(cis_gt) != int) & (cis_gt.shape[1] > 0):

        x = cis_gt.values
        y = adj_exp.ravel()

        #KNN
        trials = Trials()  #reset the trials object
        best = fmin(fn=objective,
                    space=knn_space,
                    algo=algo,
                    max_evals=max_evals,
                    trials=trials)
        result_table = pd.DataFrame(trials.results)
        best_hyperparam = hyperopt.space_eval(knn_space, best)
        best_hyperparam.pop("type")  #just to remove "type" from the param dict

        open(output + pop + "_knn_hyperopt_chr" + chrom + ".txt",
             "a").write("\n" + gene + "\t" + gene_name + "\t" + chrom + "\t" +
                        str(best_hyperparam) + "\t")

        for i in range(0, max_evals,
                       1):  #I negate the loss in order to get cvR2
Esempio n. 37
0
def optimize(X,
             y,
             lname,
             regression=True,
             hyperopt_step=100,
             arch=False,
             epochs=1000,
             X_val=False,
             y_val=False,
             model=False):
    '''
    Model hyperparameter optimization.
    
    Parameters
    ---
        X: np.array, features
        y: np.array, targets
        lname: str, name of the target property
        regression: boolean, whether it is a regression task
        hyperopt_step: int, number of steps for hyperopt
        arch: boolean or list, whether to use a fix architecture
        epochs: int, maximum epochs during the model training
        X_val: boolean or np.array, validation features
        y_val: boolean or np.array, validation targets
        model: boolean for keras.model, can start with a input model instead of building from scratch.
    '''
    np.random.seed(1234)
    if arch == False:
        architectures = [(128, 128), (256, 256), (512, 512), (128, 128, 128),
                         (256, 256, 256), (512, 512, 512)]
    else:
        architectures = [arch]
    bzs = [16, 32, 64, 128, 256, 512]
    ress = [True, False]
    bypasses = [True, False]
    if not model == False:
        space = {
            'lr': hp.uniform('lr', 1e-5, 1e-3),
            'batch_size': hp.choice('batch_size', bzs),
            'beta_1': hp.uniform('beta_1', 0.75, 0.99),
            'decay': hp.loguniform('decay', np.log(1e-5), np.log(1e-1)),
            'amsgrad': True,
            'patience': 50,
        }
    else:
        space = {
            'lr': hp.uniform('lr', 1e-5, 1e-3),
            'drop_rate': hp.uniform('drop_rate', 0, 0.5),
            'reg': hp.loguniform('reg', np.log(1e-5), np.log(5e-1)),
            'batch_size': hp.choice('batch_size', bzs),
            'hidden_size': hp.choice('hidden_size', architectures),
            'beta_1': hp.uniform('beta_1', 0.75, 0.99),
            'decay': hp.loguniform('decay', np.log(1e-5), np.log(1e-1)),
            'res': hp.choice('res', ress),
            'bypass': hp.choice('bypass', bypasses),
            'amsgrad': True,
            'patience': 50,
        }
    objective_func = partial(train_model_hyperopt,
                             X=X,
                             y=y,
                             lname=lname,
                             regression=regression,
                             epochs=epochs,
                             X_val=X_val,
                             y_val=y_val,
                             input_model=model)
    trials = Trials()
    best_params = fmin(objective_func,
                       space,
                       algo=tpe.suggest,
                       trials=trials,
                       max_evals=hyperopt_step,
                       rstate=np.random.RandomState(0))
    if not model == False:
        best_params.update({
            'batch_size': bzs[best_params['batch_size']],
            'amsgrad': True,
            'patience': 10,
        })
    else:
        best_params.update({
            'hidden_size':
            architectures[best_params['hidden_size']],
            'batch_size':
            bzs[best_params['batch_size']],
            'res':
            ress[best_params['res']],
            'bypass':
            bypasses[best_params['bypass']],
            'amsgrad':
            True,
            'patience':
            10,
        })
    # One extra model training on train/validation set to get the number of epoch for the final model training.
    returned = train_model_hyperopt(best_params,
                                    X,
                                    y,
                                    lname,
                                    regression=regression,
                                    epochs=epochs,
                                    X_val=X_val,
                                    y_val=y_val,
                                    input_model=model)
    best_params.update({'epochs': returned['epochs']})
    return best_params
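
A hypothetical call to optimize, with random arrays standing in for real features and targets:

import numpy as np

X = np.random.rand(500, 40)
y = np.random.rand(500)
best_params = optimize(X, y, lname="target",
                       regression=True,
                       hyperopt_step=20,
                       epochs=200)
print(best_params)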
Esempio n. 38
0
    eval_set = [(X_train, y_train), (X_test, y_test)]

    clf.fit(X_train,
            y_train,
            eval_set=eval_set,
            eval_metric="rmse",
            verbose=False)

    y_pred = clf.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**(0.5)

    return {'loss': rmse, 'status': STATUS_OK}


trials_reg = Trials()
best = fmin(fn=objetivo,
            space=space,
            algo=tpe.suggest,
            max_evals=1000,
            trials=trials_reg)
print(best)

modelo = xgb.XGBRegressor(n_estimators=int(best['n_estimators']),
                          gamma=best['x_gamma'],
                          learning_rate=best['learning_rate'],
                          max_depth=int(best['x_max_depth']),
                          min_child_weight=best['x_min_child'],
                          reg_lambda=best['x_reg_lambda'],
                          subsample=best['x_subsample'],
                          objective='reg:squarederror')
Esempio n. 39
0
    parser.add_argument("--early_stopping_rounds", default=10, type=int)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--nfold", default=10, type=int)
    cv_params = vars(parser.parse_args())

    parser_2 = argparse.ArgumentParser(
        description='Others',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser_2.add_argument("--max_evals", default=2, type=int)
    parser_2.add_argument("--filename", default='tmp', type=str)
    parser_2.add_argument("--n_stack", default=1, type=int)
    args = parser_2.parse_args()

    # load and combine parameters
    params = Params(cv_params=cv_params)
    params.lgb_params()

    # load dataset
    train, train_label = load_dataset(args.n_stack)

    bayes_trials_1 = Trials()
    obj = HPOpt_cv(train, train_label)
    lgb_opt = obj.process(fn_name='lgb_cv',
                          space=params.get_param(),
                          trials=bayes_trials_1,
                          algo=tpe.suggest,
                          max_evals=args.max_evals)

    # save trial
    save_obj(bayes_trials_1, args.filename)
    # tmp = load_obj('tmp')
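
save_obj and load_obj are not shown in this snippet; a minimal pickle-based sketch of what such helpers might look like (the .pkl suffix is an assumption):

import pickle

def save_obj(obj, name):
    # Serialize the Trials object so the search history can be reloaded later.
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)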
Esempio n. 40
0
    def run_train_hopt(self):
        logger.info("start training with hopt")
        skf = StratifiedKFold(n_splits=self._best_cv,
                              shuffle=True,
                              random_state=3655)
        sample_weight = self._calc_w(self.y)

        def score(params, skf=skf, sample_weight=sample_weight):
            prms = {
                "C": params["C"],
                "gamma": params["gamma"],
                "kernel": params['kernel']
            }

            list_score_acc = []
            list_score_logloss = []

            for train, val in skf.split(self.X, self.y):
                X_train, X_val = self.X[train], self.X[val]
                y_train, y_val = self.y[train], self.y[val]

                weight_train = sample_weight[train]
                weight_val = sample_weight[val]

                model = self._set_model()
                model.train(prms=prms,
                            X_tr=X_train,
                            X_val=X_val,
                            y_tr=y_train,
                            y_val=y_val,
                            w_tr=weight_train,
                            w_val=weight_val)

                list_score_acc.append(model._score_acc)
                list_score_logloss.append(model._score_logloss)
                """
                ##n_estimaters=0 causes error at .fit()
                if model.best_iteration_ != -1:
                    list_best_iter.append(model.best_iteration_)
                else:
                    list_best_iter.append(params['n_estimators'])
                break
                """
            # logger.info("n_estimators: {}".format(list_best_iter))
            # params["n_estimators"] = np.mean(list_best_iter, dtype=int)

            score_acc = (np.mean(list_score_acc), np.min(list_score_acc),
                         np.max(list_score_acc))
            # logger.info("score_acc %s" % np.mean(list_score_acc))

            # score_logloss = (np.mean(list_score_logloss), np.min(list_score_logloss), np.max(list_score_logloss))
            # score_f1 = (np.mean(list_score_f1), np.min(list_score_f1), np.max(list_score_f1))
            # score_auc = (np.mean(list_score_auc), np.min(list_score_auc), np.max(list_score_auc))

            logloss = np.mean(list_score_logloss)
            return {
                'loss': logloss,
                'status': STATUS_OK,
                'localCV_acc': score_acc
            }

        space = {
            'C': hp.uniform('C', 0, 2),
            'gamma': hp.loguniform('gamma', -8, 2),
            'kernel': hp.choice('kernel', ['rbf', 'poly', 'sigmoid'])
        }

        trials = Trials()
        best_params = fmin(rstate=hopt_random_state,
                           fn=score,
                           space=space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=30)
        self.localCV_acc = list(
            filter(lambda x: x["loss"] == min(trials.losses()),
                   trials.results))[0]["localCV_acc"][0]
        self.localCV_loss = min(trials.losses())
        logger.info("localCV_acc %s" % self.localCV_acc)
        logger.info("localCV_loss %s" % self.localCV_loss)

        kernel_choices = ['rbf', 'poly', 'sigmoid']
        self.best_params = {
            "C": best_params["C"],
            "gamma": best_params["gamma"],
            # fmin returns the index chosen by hp.choice, so map it back to the kernel name
            "kernel": kernel_choices[best_params["kernel"]]
        }

        logger.info("best params are %s" % self.best_params)

        sample_weight = self._calc_w(self.y)
        model = self._set_model()
        logger.info("start training with best params")

        model.train_all(prms=self.best_params,
                        X_tr=self.X,
                        y_tr=self.y,
                        w_tr=sample_weight)

        model.save_model()
        del model, sample_weight, skf
        gc.collect()

        logger.info("model with best params is saved")
Esempio n. 41
0
import hyperopt
from hyperopt import Trials, Domain

domain = Domain(fn = None)

docs = hyperopt.rand.suggest(range(10), domain, Trials(), seed=123)
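
For comparison, the more common high-level route to random search goes through fmin with rand.suggest; a minimal self-contained sketch:

from hyperopt import Trials, fmin, hp, rand

trials = Trials()
best = fmin(fn=lambda params: params['x'] ** 2,        # toy objective
            space={'x': hp.uniform('x', -10, 10)},
            algo=rand.suggest,
            max_evals=10,
            trials=trials)
print(best, min(trials.losses()))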
Esempio n. 42
0
def main(args):
    train_data, val_data, test_data, label_dict, img_mean = load_image(
        args.data,
        dim=args.dim,
        mode="RGB",
        zero_center=args.zero_center,
        train_size=args.train_size,
        crop=args.crop)

    # use both val and test as val
    val_data = (np.concatenate((val_data[0], test_data[0]), axis=0),
                np.concatenate((val_data[1], test_data[1])))

    num_channels = train_data[0][0].shape[0]

    data_size = (None, num_channels, CROP_DIM, CROP_DIM)

    def objective(hyperargs):
        net = build_network(args.model, data_size, cudnn=args.dnn)
        lr = args.learning_rate
        lmbda = args.lmbda

        if 'lr' in hyperargs:
            lr = hyperargs['lr']
        if 'lmbda' in hyperargs:
            lmbda = hyperargs['lmbda']

        best_val_cost, train_costs, val_costs = net.train(
            algorithm,
            train_data,
            val_data=val_data,
            test_data=None,
            lr=lr,
            lmbda=lmbda,
            train_batch_size=TRAIN_BATCH_SIZE,
            val_batch_size=10,
            epochs=args.epoch,
            crop_dim=CROP_DIM,
            img_mean=img_mean,
            color_jitter=args.color_jitter)
        return {'loss': best_val_cost, 'status': STATUS_OK}

    space = {}
    if not args.learning_rate:
        print "Optimizing learning rate"
        space['lr'] = hp.uniform('lr', 0, 0.001)
    if not args.lmbda:
        print "Optimizing regularization rate"
        space['lmbda'] = hp.uniform('lambda', 0, 0.1)

    trials = Trials()

    best = fmin(objective,
                space,
                algo=tpe.suggest,
                max_evals=args.num_trials,
                trials=trials)

    print best
    print hyperopt.space_eval(space, best)

    f = open('out.txt', 'w')
    cPickle.dump(trials.trials, f)
    f.close()
Esempio n. 43
0
class HyperOptimizer(object):
    def __init__(self, train_set, holdout_set, command, max_evals=100,
                 outer_loss_function='logistic',
                 searcher='tpe', is_regression=False):
        self.train_set = train_set
        self.holdout_set = holdout_set

        self.train_model = './current.model'
        self.holdout_pred = './holdout.pred'
        self.trials_output = './trials.json'
        self.hyperopt_progress_plot = './hyperopt_progress.png'
        self.log = './log.log'

        self.logger = self._configure_logger()

        # hyperopt parameter sample, converted into a string with flags
        self.param_suffix = None
        self.train_command = None
        self.validate_command = None

        self.y_true_train = []
        self.y_true_holdout = []

        self.outer_loss_function = outer_loss_function
        self.space = self._get_space(command)
        self.max_evals = max_evals
        self.searcher = searcher
        self.is_regression = is_regression

        self.trials = Trials()
        self.current_trial = 0

    def _get_space(self, command):
        hs = HyperoptSpaceConstructor(command)
        hs.string_to_pyll()
        return hs.space

    def _configure_logger(self):
        LOGGER_FORMAT = "%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s/%(module)s:%(lineno)d]: %(message)s"
        LOGGER_DATEFMT = "%Y-%m-%d %H:%M:%S"
        LOGFILE = self.log

        logging.basicConfig(format=LOGGER_FORMAT,
                            datefmt=LOGGER_DATEFMT,
                            level=logging.DEBUG)
        formatter = logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)

        file_handler = logging.FileHandler(LOGFILE)
        file_handler.setFormatter(formatter)

        logger = logging.getLogger()
        logger.addHandler(file_handler)
        return logger

    def get_hyperparam_string(self, **kwargs):
        for arg in ['--passes']: #, '--rank', '--lrq']:
            if arg in kwargs:
                kwargs[arg] = int(kwargs[arg])

        #print 'KWARGS: ', kwargs
        flags = [key for key in kwargs if key.startswith('-')]
        for flag in flags:
            if kwargs[flag] == 'omit':
                del kwargs[flag]

        self.param_suffix = ' '.join(['%s %s' % (key, kwargs[key]) for key in kwargs if key.startswith('-')])
        self.param_suffix += ' %s' % (kwargs['argument'])

    def compose_vw_train_command(self):
        data_part = ('vw -d %s -f %s --holdout_off -c '
                     % (self.train_set, self.train_model))
        self.train_command = ' '.join([data_part, self.param_suffix])

    def compose_vw_validate_command(self):
        data_part = 'vw -t -d %s -i %s -p %s --holdout_off -c' \
                    % (self.holdout_set, self.train_model, self.holdout_pred)
        self.validate_command = data_part

    def fit_vw(self):
        self.compose_vw_train_command()
        self.logger.info("executing the following command (training): %s" % self.train_command)
        subprocess.call(shlex.split(self.train_command))

    def validate_vw(self):
        self.compose_vw_validate_command()
        self.logger.info("executing the following command (validation): %s" % self.validate_command)
        subprocess.call(shlex.split(self.validate_command))

    def get_y_true_train(self):
        self.logger.info("loading true train class labels...")
        yh = open(self.train_set, 'r')
        self.y_true_train = []
        for line in yh:
            self.y_true_train.append(int(line.strip()[0:2]))
        if not self.is_regression:
            self.y_true_train = [(i + 1.) / 2 for i in self.y_true_train]
        self.logger.info("train length: %d" % len(self.y_true_train))

    def get_y_true_holdout(self):
        self.logger.info("loading true holdout class labels...")
        yh = open(self.holdout_set, 'r')
        self.y_true_holdout = []
        for line in yh:
            self.y_true_holdout.append(int(line.strip()[0:2]))
        if not self.is_regression:
            self.y_true_holdout = [(i + 1.) / 2 for i in self.y_true_holdout]
        self.logger.info("holdout length: %d" % len(self.y_true_holdout))

    def validation_metric_vw(self):
        v = open('%s' % self.holdout_pred, 'r')
        y_pred_holdout = []
        for line in v:
            y_pred_holdout.append(float(line.strip()))

        if self.outer_loss_function == 'logistic':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':  # TODO: write it
            pass

        elif self.outer_loss_function == 'hinge':  # TODO: write it
            pass

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss

    def hyperopt_search(self, parallel=False):  # TODO: implement parallel search with MongoTrials
        def objective(kwargs):
            start = dt.now()

            self.current_trial += 1
            self.logger.info('\n\nStarting trial no.%d' % self.current_trial)
            self.get_hyperparam_string(**kwargs)
            self.fit_vw()
            self.validate_vw()
            loss = self.validation_metric_vw()

            finish = dt.now()
            elapsed = finish - start
            self.logger.info("evaluation time for this step: %s" % str(elapsed))

            # clean up
            subprocess.call(shlex.split('rm %s %s' % (self.train_model, self.holdout_pred)))

            to_return = {'status': STATUS_OK,
                         'loss': loss,  # TODO: include also train loss tracking in order to prevent overfitting
                         'eval_time': elapsed.seconds,
                         'train_command': self.train_command,
                         'current_trial': self.current_trial
            }
            return to_return

        self.trials = Trials()
        if self.searcher == 'tpe':
            algo = tpe.suggest
        elif self.searcher == 'rand':
            algo = rand.suggest

        logging.debug("starting hypersearch...")
        best_params = fmin(objective, space=self.space, trials=self.trials, algo=algo, max_evals=self.max_evals)
        self.logger.debug("the best hyperopt parameters: %s" % str(best_params))

        json.dump(self.trials.results, open(self.trials_output, 'w'))
        self.logger.info('All the trials results are saved at %s' % self.trials_output)

        best_configuration = self.trials.results[np.argmin(self.trials.losses())]['train_command']
        best_loss = self.trials.results[np.argmin(self.trials.losses())]['loss']
        self.logger.info("\n\nA full training command with the best hyperparameters: \n%s\n\n" % best_configuration)
        self.logger.info("\n\nThe best holdout loss value: \n%s\n\n" % best_loss)

        return best_configuration, best_loss

    def plot_progress(self):
        try:
            sns.set_palette('Set2')
            sns.set_style("darkgrid", {"axes.facecolor": ".95"})
        except:
            pass

        self.logger.debug('plotting...')
        plt.figure(figsize=(15,10))
        plt.subplot(211)
        plt.plot(self.trials.losses(), '.', markersize=12)
        plt.title('Per-Iteration Outer Loss', fontsize=16)
        plt.ylabel('Outer loss function value')
        if self.outer_loss_function in ['logloss']:
            plt.yscale('log')
        xticks = [int(i) for i in np.linspace(plt.xlim()[0], plt.xlim()[1], min(len(self.trials.losses()), 11))]
        plt.xticks(xticks, xticks)


        plt.subplot(212)
        plt.plot(np.minimum.accumulate(self.trials.losses()), '.', markersize=12)
        plt.title('Cumulative Minimum Outer Loss', fontsize=16)
        plt.xlabel('Iteration number')
        plt.ylabel('Outer loss function value')
        xticks = [int(i) for i in np.linspace(plt.xlim()[0], plt.xlim()[1], min(len(self.trials.losses()), 11))]
        plt.xticks(xticks, xticks)

        plt.tight_layout()
        plt.savefig(self.hyperopt_progress_plot)
        self.logger.info('The diagnostic hyperopt progress plot is saved: %s' % self.hyperopt_progress_plot)
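
A hypothetical driver for the class above, assuming VW-format train and holdout files exist on disk; the command string expected by HyperoptSpaceConstructor is not shown in this snippet, so the one below is only a placeholder:

optimizer = HyperOptimizer(train_set='train.vw',
                           holdout_set='holdout.vw',
                           command='--loss_function logistic',  # placeholder command string
                           max_evals=50,
                           outer_loss_function='logistic',
                           searcher='tpe')
best_command, best_loss = optimizer.hyperopt_search()
optimizer.plot_progress()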
Esempio n. 44
0
    def explore(self, x_train, y_train, x_val, y_val, space, model_specs, train_specs, path, max_evals, epochs,
                metric=None, trials=None, net_name='NN', verbose=0, seed=None, val_inference_in_path=None,
                callbacks=None, cuda=None):
        """

        :param x_train:
        :param y_train:
        :param x_val:
        :param y_val:
        :param space:
        :param model_specs:
        :param train_specs:
        :param path:
        :param max_evals:
        :param epochs:
        :param metric:
        :param trials:
        :param net_name:
        :param verbose:
        :param seed:
        :param val_inference_in_path:
        :param callbacks:
        :param cuda:
        :return:
        """
        self.__cuda = cuda
        if trials is None:
            trials = Trials()

        def objective(space):

            # Create model
            specs = space.copy()
            specs.update(model_specs)
            model = self.__model_constructor(**specs)
            if self.__model_constructor_wrapper:
                self.__model_constructor_wrapper(model)
            if self.__cuda in specs and BACKEND != 'tensorflow':
                model.cuda()

            # Print some information
            iteration = len(trials.losses())
            if verbose > 0:
                print('\n')
                print('iteration : {}'.format(0 if trials.losses() is None else iteration))
                [print('{}: {}'.format(key, value)) for key, value in specs.items()]
                print(model.summary(line_length=200))

            # Train model
            trainer_kargs = train_specs.copy()
            trainer_kargs.update({'module': model})
            if callbacks:
                trainer_kargs.update({'callbacks': callbacks})
            trainer = self.__trainer_class(**trainer_kargs)
            train_kargs = {}
            if not any([val_ is None for val_ in [x_val, y_val]]) and \
                    all([val_ in list(getfullargspec(trainer.fit))[0] for val_ in ['x_val', 'y_val']]):
                train_kargs.update({'x_val': x_val, 'y_val': y_val})
            train_kargs.update({'epochs': epochs})
            for karg, val in zip(['verbose'], [verbose]):
                if karg in list(getfullargspec(trainer.fit))[0]:
                    train_kargs.update({'verbose': val})
            trainer.fit(x_train, y_train, **train_kargs)

            # Exploration loss
            exp_loss = None  # ToDo: compatible with custom metric
            if metric in [None, 'categorical_accuracy', 'accuracy']:
                def prepare_for_acc(x):
                    if not isinstance(x, list):
                        x_ = [x]
                    else:
                        x_ = x.copy()
                    for i in range(len(x_)):
                        if len(x_[i].shape) == 1:
                            x_[i] = np.where(x_[i] > 0.5, 1, 0)
                        else:
                            x_[i] = np.argmax(x_[i], axis=-1)
                    return x_
                y_pred = prepare_for_acc(trainer.predict(x_val))
                y_val_ = prepare_for_acc(y_val)
                acc_score = []
                for i in range(len(y_pred)):
                    acc_score.append(accuracy_score(y_pred[i],  y_val_[i]))
                exp_loss = 1 - np.mean(acc_score)
            elif metric == 'i_auc':  # ToDo: make this work
                y_pred = model.predict(x_val)
                if not isinstance(y_pred, list):
                    y_pred = [y_pred]
                exp_loss = []
                for i in np.arange(0, self.__total_n_models):
                    if len(np.bincount(y_val[i][:, -1])) > 1 and not math.isnan(np.sum(y_pred[i])):
                        fpr, tpr, thresholds = metrics.roc_curve(y_val[i][:, -1], y_pred[i][:, -1])
                        exp_loss += [(1 - metrics.auc(fpr, tpr))]
                exp_loss = np.mean(exp_loss) if len(exp_loss) > 0 else 1
            if verbose > 0:
                print('\n')
                print('Exploration Loss: ', exp_loss)
            status = STATUS_OK if exp_loss is not None and not math.isnan(exp_loss) else STATUS_FAIL

            # Save trials
            with open(path + 'trials.hyperopt', 'wb') as f:
                pickle.dump(trials, f)

            # Save model if it is the best so far
            best_exp_losss_name = path + 'best_' + net_name + '_exp_loss'
            trials_losses = [loss_ for loss_ in trials.losses() if loss_]
            best_exp_loss = min(trials_losses) if len(trials_losses) > 0 else None
            print('best val loss so far: ', best_exp_loss)
            print('current val loss: ', exp_loss)
            best_exp_loss_cond = best_exp_loss is None or exp_loss < best_exp_loss
            print('save: ', status, best_exp_loss_cond)
            if status == STATUS_OK and best_exp_loss_cond:
                df = pd.DataFrame(data=[exp_loss], columns=['best_exp_loss'])
                df.to_pickle(best_exp_losss_name)
                self.__save_model(model=model, name=path + 'best_exp_' + net_name + '_json')
                with open(path + 'best_exp_' + net_name + '_hparams', 'wb') as f:
                    pickle.dump(space, f, protocol=pickle.HIGHEST_PROTOCOL)
                if val_inference_in_path is not None:
                    y_val_ = np.concatenate(y_val, axis=1) if isinstance(y_val, list) else y_val
                    np.savetxt(val_inference_in_path + 'val_target.csv', y_val_, delimiter=',')
                    y_inf = trainer.predict(x_val)
                    y_inf = np.concatenate(y_inf, axis=1) if isinstance(y_inf, list) else y_inf
                    np.savetxt(val_inference_in_path + 'val_target_inference.csv', y_inf, delimiter=',')

            clear_session()
            del model

            return {'loss': exp_loss, 'status': status}

        def optimize():

            if len(trials.trials) < max_evals:
                hyperopt.fmin(
                    objective,
                    rstate=None if seed is None else np.random.RandomState(seed),
                    space=space,
                    algo=hyperopt.tpe.suggest,
                    max_evals=max_evals,
                    trials=trials,
                    verbose=True,
                    return_argmin=False)
            with open(path + 'best_exp_' + net_name + '_hparams', 'rb') as f:
                best_hparams = pickle.load(f)

            # Best model
            specs = model_specs.copy()
            specs.update(best_hparams)
            best_model = self.__load_model(name=path + 'best_exp_' + net_name + '_json')
            if BACKEND == 'tensorflow':
                best_model.compile(optimizer=specs['optimizer'], loss=specs['loss'])
            else:
                best_model.cuda()
            print('best hyperparameters: ' + str(best_hparams))

            # Trainer
            trainer_kargs = train_specs.copy()
            trainer_kargs.update({'module': best_model})
            if callbacks:
                trainer_kargs.update({'callbacks': callbacks})
            trainer = self.__trainer_class(**trainer_kargs)
            if hasattr(trainer, 'initialize') and callable(trainer.initialize):
                trainer.initialize()

            return best_model, trainer

        self.__model, self.__trainer = optimize()
Esempio n. 45
0
 for feat_name in feat_names:
     param_space = param_spaces[feat_name]
     log_file = "%s/%s_hyperopt.log" % (log_path, feat_name)
     log_handler = open(log_file, 'wb' )
     writer = csv.writer( log_handler )
     headers = ['trial_counter', 'kappa_mean', 'kappa_std' ]
     for k,v in sorted(param_space.items()):
         headers.append(k)
     writer.writerow( headers )
     log_handler.flush()
     
     print("************************************************************")
     print("Search for the best params")
     #global trial_counter
     trial_counter = 0
     trials = Trials()
     objective = lambda p: hyperopt_wrapper(p,feat_name)
     best_params = fmin(objective, param_space, algo=tpe.suggest,
                        trials=trials, max_evals=param_space["max_evals"])
     for f in int_feat:
         if best_params.has_key(f):
             best_params[f] = int(best_params[f])
     print("************************************************************")
     print("Best params")
     for k,v in best_params.items():
         print "        %s: %s" % (k,v)
     trial_kappas = -np.asarray(trials.losses(), dtype=float)
     best_kappa_mean = max(trial_kappas)
     ind = np.where(trial_kappas == best_kappa_mean)[0][0]
     best_kappa_std = trials.trial_attachments(trials.trials[ind])['std']
     print("Kappa stats")
Esempio n. 46
0
 def __init__(self, *args, **kwargs):
     self._optimization_info = {'trials': Trials(), 'best': {}}
     self._temporary_opt_file = aux.get_temporary_file()
     self.optimize_after_callback_fn = kwargs.get(
         'optimize_after_callback_fn')
Esempio n. 47
0
def optimize_model_parameter_validation(x, y, model_name=None, loss_function="accuracy", parameter=None, max_evals=100, n_folds=5, isWrite=True, problem_pattern="classification"):
    """
    hyperopt model turning
    """
    if model_name == None and parameter == None:
        print "you must set parameter or model_name"
        return None
    elif parameter != None:
        param = parameter
    elif model_name != None:
        param = parameter_dictionary[model_name]
    else:
        return None

    validation_indexs = []

    if problem_pattern == "classification":
        for train_index, test_index in cross_validation.StratifiedKFold(y, n_folds=n_folds):
            validation_indexs.append((train_index, test_index))
    else:
        for train_index, test_index in cross_validation.KFold(len(y), n_folds=n_folds):
            validation_indexs.append((train_index, test_index))

    trials = Trials()
    function = lambda param: optimize_model_function(
        param, x, y, validation_indexs, loss_function)
    print param
    print "========================================================================"
    best_param = fmin(function, param,
                      algo=tpe.suggest, max_evals=max_evals, trials=trials)
    print "========================================================================"
    print "write result to csv files"

    # write the csv file
    if isWrite:
        datas = []
        for trial_data in trials.trials:
            print trial_data
            trial_parameter_dictionary = {}
            trial_parameter_dictionary['model'] = model_name
            trial_parameter_dictionary['tid'] = trial_data['misc']['tid']
            for key, value in trial_data['misc']['vals'].items():
                print key, value[0]
                trial_parameter_dictionary[key] = value[0]
            trial_parameter_dictionary['loss'] = trial_data['result']['loss']
            trial_parameter_dictionary[
                'status'] = trial_data['result']['status']
            datas.append(trial_parameter_dictionary)
        filename = str(time.time()) + ".csv"
        dictionary_in_list_convert_to_csv(datas, filename)

    print trials.statuses()
    return best_param

    def model_evaluation(clf, x, y, evaluate_function_name, labeled_type, label_convert_type="normal"):
        if evaluate_function_name == "accuracy":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
            score = -score
        elif evaluate_function_name == "logloss":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
            train_score = -score
        elif evaluate_function_name == "mean_squared_error":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
        elif evaluate_function_name == "gini":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
            score = -score
            train_score = -train_score
        elif evaluate_function_name == "rmsle":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
        elif evaluate_function_name == "auc":
            if params['model'] == "XGBREGLOGISTIC":
                y_pred = clf.predict_proba(x_test)
            else:
                y_pred = clf.predict_proba(x_test)[:, 1]

            train_score = evaluate_function(
                y_train, train_y_pred, evaluate_function_name)
            score = evaluate_function(y_test, y_pred, evaluate_function_name)
            score = -score
            train_score = -train_score
        elif evaluate_function_name == "rmspe":
            y_pred = clf.predict(x)
            score = evaluate_function(
                y, y_pred, evaluate_function_name)
            score = score
        return score
Esempio n. 48
0
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'iteration', 'train_time', 'output'])
of_connection.close()


from hyperopt import tpe

# Create the algorithm
tpe_algorithm = tpe.suggest


from hyperopt import Trials

# Record results
bayes_trials = Trials()

from hyperopt import fmin


ITERATION = 1

best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = bayes_trials, max_evals = MAX_EVALS)

print(best)
d=  best
#print(bayes_trials.results)
bayes_trials_results = sorted(bayes_trials.results, key = lambda x: x['loss'])
#bayes_trials_results[:1]

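
The objective passed to fmin above is defined elsewhere; a hypothetical stand-in that logs one row per trial to the CSV opened at the top of this snippet could look like this (the loss computation is a placeholder):

import csv
import time
from hyperopt import STATUS_OK

def objective(params):
    global ITERATION
    ITERATION += 1
    start = time.time()
    loss = 0.0  # placeholder: train a model on params and compute its validation loss here
    with open(out_file, 'a') as f:
        csv.writer(f).writerow([loss, params, ITERATION, time.time() - start, None])
    return {'loss': loss, 'status': STATUS_OK}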
Esempio n. 49
0
def main():

    usage = "%prog <DRLD|MOLD|MIP|Primary|General|PK-...>"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='model', default='basic',
                      help='Model: (basic|GRU|LSTM); default=%default')
    parser.add_option('-o', dest='output_dirname', default='bayes_opt_rnn_mod',
                      help='Output directory name')
    parser.add_option('--reuse', dest='reuse', action="store_true", default=False,
                      help='Use reusable holdout; default=%default')
    parser.add_option('--mod', dest='mod', action="store_true", default=False,
                      help='Use modifications; default=%default')


    (options, args) = parser.parse_args()


    global output_dirname, output_filename, reuse, search_alpha, space, mod, dataset
    reuse = options.reuse
    output_dirname = options.output_dirname
    model = options.model
    mod = options.mod

    dataset = args[0]

    if model == 'basic':
        space['arch']['unit'] = 'basic'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 200, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -4, -1)
    elif model == 'GRU':
        space['arch']['unit'] = 'GRU'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 150, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    elif model == 'LSTM':
        space['arch']['unit'] = 'LSTM'
        space['arch']['n_hidden'] = hp.quniform('n_hidden', 5, 100, 5)
        space['training']['learning_rate'] = hp.loguniform('learning_rate', -5, -1.5)
    else:
        sys.exit('Model not supported!')

    output_dirname += '_' + model

    if reuse:
        output_dirname += '_reuse'

    if mod:
        output_dirname += '_mod'

    output_dirname += '_' + dataset

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), 'log')

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(output_dirname + '\n')
        #output_file.write('reuse = ' + str(reuse) + '\n')

    trials = Trials()
    best = fmin(call_experiment,
                space=space,
                algo=tpe.suggest,
                max_evals=60,
                trials=trials)

    print space_eval(space, best)
    print trials.losses()
Esempio n. 50
0
def hyperopt(param_space, X_train, y_train, X_test, y_test, args):
    resampling = over_sampling.RandomOverSampler(sampling_strategy='auto',
                                                 random_state=42)

    start = time.time()

    def objective_function(params):
        classifier_type = params['type']
        del params['type']
        if classifier_type == 'rf':
            clf = RandomForestClassifier(**params)
        elif classifier_type == 'svm':
            clf = SVC(**params)
        else:
            return 0

        pl = make_pipeline_imb(resampling, clf)

        score = cross_val_score(pl, X_train, y_train, n_jobs=args.cpus,
                                cv=3).mean()
        return {'loss': -score, 'status': STATUS_OK}

    rstate = np.random.RandomState(1)  # any fixed seed keeps the search reproducible

    trials = Trials()
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=args.num_eval,
                      trials=trials,
                      rstate=rstate)

    loss = [x['result']['loss'] for x in trials.trials]

    joblib.dump(
        trials,
        os.path.join(
            args.modeldir, 'hyperopt_trials_niters{}_ssize{}.pkl'.format(
                args.num_eval, args.ssize)))

    # best_param_values = [ x for x in best_param.values() ]
    #
    # del best_param_values['classifier_type']
    #
    # if best_param_values[2] == 0:
    # 	max_features = 'auto'
    # else:
    # 	max_features = 'sqrt'
    #
    # if best_param_values[0] == 0:
    # 	bootstrap = 'True'
    # else:
    # 	bootstrap = 'False'
    #
    # print("Best parameters: ", best_param)
    #
    # clf_best = RandomForestClassifier(n_estimators=int(best_param_values[5]),
    # 								  max_features=max_features,
    # 							      max_depth=int(best_param_values[1]),
    # 								  min_samples_leaf=int(best_param_values[3]),
    # 								  min_samples_split=int(best_param_values[4]),
    # 								  bootstrap=bootstrap,
    # 								  n_jobs=args.cpus)
    #
    # pl = make_pipeline_imb(resampling, clf_best)
    #
    # # clf_best.fit(X_train, y_train)
    # estimator_fit = pl.fit(X_train, y_train)
    #
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss) * -1)
    print("Best parameters: ", best_param)
    # print("Test Score: ", estimator_fit.score(X_test, y_test))
    print("Time elapsed: ", round(time.time() - start, 2))
    print("Parameter combinations evaluated: ", args.num_eval)
    #
    # if args.writemodel:
    # 	model_file = os.path.join(args.modeldir, 'model-' + args.classifier + '.h5')
    # 	# -- save the model
    # 	joblib.dump(clf_best, model_file)
    # 	print("Writing the model over path {}".format(model_file))

    return trials
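# Note (added sketch): the Trials object persisted above can be reloaded later
# to inspect the search, assuming the same `args` used to build the file name.
import os
import joblib

trials_path = os.path.join(
    args.modeldir,
    'hyperopt_trials_niters{}_ssize{}.pkl'.format(args.num_eval, args.ssize))
reloaded = joblib.load(trials_path)
print(min(reloaded.losses()) * -1)          # best cross-validated score
print(reloaded.best_trial['misc']['vals'])  # raw (index-based) best parameters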
Esempio n. 51
0
def hyper(args):
    adata = io.read_dataset(args.input,
                            transpose=args.transpose,
                            test_split=False)

    hyper_params = {
        "data": {
            "norm_input_log": hp.choice('d_norm_log', (True, False)),
            "norm_input_zeromean": hp.choice('d_norm_zeromean', (True, False)),
            "norm_input_sf": hp.choice('d_norm_sf', (True, False)),
        },
        "model": {
            "lr":
            hp.loguniform("m_lr", np.log(1e-3), np.log(1e-2)),
            "ridge":
            hp.loguniform("m_ridge", np.log(1e-7), np.log(1e-1)),
            "l1_enc_coef":
            hp.loguniform("m_l1_enc_coef", np.log(1e-7), np.log(1e-1)),
            "hidden_size":
            hp.choice("m_hiddensize",
                      ((64, 32, 64), (32, 16, 32), (64, 64), (32, 32),
                       (16, 16), (16, ), (32, ), (64, ), (128, ))),
            "activation":
            hp.choice("m_activation",
                      ('relu', 'selu', 'elu', 'PReLU', 'linear', 'LeakyReLU')),
            "aetype":
            hp.choice("m_aetype", ('zinb', 'zinb-conddisp')),
            "batchnorm":
            hp.choice("m_batchnorm", (True, False)),
            "dropout":
            hp.uniform("m_do", 0, 0.7),
            "input_dropout":
            hp.uniform("m_input_do", 0, 0.8),
        },
        "fit": {
            "epochs": args.hyperepoch
        }
    }

    def data_fn(norm_input_log, norm_input_zeromean, norm_input_sf):

        ad = adata.copy()
        ad = io.normalize(ad,
                          size_factors=norm_input_sf,
                          logtrans_input=norm_input_log,
                          normalize_input=norm_input_zeromean)

        x_train = {'count': ad.X, 'size_factors': ad.obs.size_factors}
        y_train = ad.raw.X

        return (x_train, y_train),

    def model_fn(train_data, lr, hidden_size, activation, aetype, batchnorm,
                 dropout, input_dropout, ridge, l1_enc_coef):

        net = AE_types[aetype](train_data[1].shape[1],
                               hidden_size=hidden_size,
                               l2_coef=0.0,
                               l1_coef=0.0,
                               l2_enc_coef=0.0,
                               l1_enc_coef=l1_enc_coef,
                               ridge=ridge,
                               hidden_dropout=dropout,
                               input_dropout=input_dropout,
                               batchnorm=batchnorm,
                               activation=activation,
                               init='glorot_uniform',
                               debug=args.debug)
        net.build()
        net.model.summary()

        optimizer = opt.__dict__['RMSprop'](lr=lr, clipvalue=5.0)
        net.model.compile(loss=net.loss, optimizer=optimizer)

        return net.model

    output_dir = os.path.join(args.outputdir, 'hyperopt_results')
    objective = CompileFN('autoencoder_hyperpar_db',
                          'myexp1',
                          data_fn=data_fn,
                          model_fn=model_fn,
                          loss_metric='loss',
                          loss_metric_mode='min',
                          valid_split=.2,
                          save_model=None,
                          save_results=True,
                          use_tensorboard=False,
                          save_dir=output_dir)

    test_fn(objective, hyper_params, save_model=None)

    trials = Trials()
    best = fmin(objective,
                hyper_params,
                trials=trials,
                algo=tpe.suggest,
                max_evals=args.hypern,
                catch_eval_exceptions=True)

    with open(os.path.join(output_dir, 'trials.pickle'), 'wb') as f:
        pickle.dump(trials, f)

    #TODO: map indices in "best" back to choice-based hyperpars before saving
    with open(os.path.join(output_dir, 'best.json'), 'wt') as f:
        json.dump(best, f, sort_keys=True, indent=4)

    print(best)
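    # Note (added sketch): one way to resolve the TODO above is hyperopt's
    # space_eval, which maps the index-based entries of `best` back to the
    # actual choice values before saving; the output file name is illustrative.
    from hyperopt import space_eval
    resolved_best = space_eval(hyper_params, best)
    with open(os.path.join(output_dir, 'best_resolved.json'), 'wt') as f:
        json.dump(resolved_best, f, sort_keys=True, indent=4, default=str)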
Esempio n. 52
0
def BayesSearch(X, y):
    """Search Hyper parameter"""
    global MODEL
    if MODEL == "log_reg":
        param_space = {
            "solver": hp.choice("solver", ["newton-cg", "saga", "lbfgs"]),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
            "C": scope.float(hp.lognormal("C", 0.0001, 3)),
        }
    elif MODEL == "sgd":
        param_space = {
            "loss": hp.choice("loss", ["log", "modified_huber"]),
            "penalty": hp.choice("penalty", ["l2", "l1", "elasticnet"]),
            "alpha": scope.float(hp.uniform("alpha", 0.001, 1)),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
        }
    elif MODEL == "rftree":
        param_space = {
            "max_depth":
            scope.int(hp.quniform("max_depth", 6, 15, 1)),
            "n_estimators":
            scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
            "criterion":
            hp.choice("criterion", ["gini", "entropy"]),
            "max_features":
            hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf":
            scope.int(hp.quniform("min_samples_leaf", 6, 100, 1)),
            "min_samples_split":
            scope.int(hp.quniform("min_samples_split", 6, 100, 1)),
            #'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "extree":
        param_space = {
            "max_depth":
            scope.int(hp.quniform("max_depth", 5, 25, 1)),
            "n_estimators":
            scope.int(hp.quniform("n_estimators", 100, 2000, 1)),
            "criterion":
            hp.choice("criterion", ["gini", "entropy"]),
            "max_features":
            hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf":
            scope.int(hp.quniform("min_samples_leaf", 3, 100, 1)),
            "min_samples_split":
            scope.int(hp.quniform("min_samples_split", 3, 100, 1)),
            #'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "gbm":
        param_space = {
            "learning_rate":
            scope.float(hp.uniform("learning_rate", 0.001, 1)),
            "n_estimators":
            scope.int(hp.quniform("n_estimators", 100, 2000, 1)),
            "subsample":
            scope.float(hp.uniform("subsample", 0.001, 1)),
            "criterion":
            hp.choice("criterion", ["friedman_mse", "mse", "mae"]),
            "max_features":
            hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf":
            scope.int(hp.quniform("min_samples_leaf", 3, 100, 1)),
            "min_samples_split":
            scope.int(hp.quniform("min_samples_split", 3, 100, 1)),
            # 'loss':hp.choice('bootstrap',['deviance', 'exponential']),
        }
    elif MODEL == "knn":
        param_space = {
            "n_neighbors": scope.int(hp.quniform("n_neighbors", 5, 100, 1)),
            "leaf_size": scope.int(hp.quniform("leaf_size", 30, 200, 1)),
        }
    elif MODEL == "lgbm":
        param_space = {
            "learning_rate":
            scope.float(hp.uniform("learning_rate", 0.0001, 0.1)),
            "n_estimators":
            scope.int(hp.quniform("n_estimators", 25, 1000, 1)),
            "max_depth":
            scope.int(hp.quniform("max_depth", 6, 15, 1)),
            "subsample":
            scope.float(hp.uniform("subsample", 0.6, 1)),
            "colsample_bytree":
            scope.float(hp.uniform("colsample_bytree", 0.6, 1)),
            # "subsample_freq":scope.int(hp.quniform("subsample_freq", 0, 5, 1)),
            "min_child_samples":
            scope.int(hp.quniform("min_child_samples", 20, 100, 1)),
            "min_split_gain":
            scope.float(hp.uniform("min_split_gain", 0.01, 1)),
            "reg_alpha":
            scope.float(hp.uniform("reg_alpha", 0.0001, 1)),
            "reg_lambda":
            scope.float(hp.uniform("reg_lambda", 0.0001, 1)),
            "num_leaves":
            scope.int(hp.quniform("num_leaves", 32, 10000, 100)),
        }
    elif MODEL == "xgbm":
        param_space = {
            "learning_rate":
            scope.float(hp.uniform("learning_rate", 0.0001, 0.1)),
            "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000,
                                                  1)),
            "max_depth": scope.int(hp.quniform("max_depth", 6, 10, 1)),
            "subsample": scope.float(hp.uniform("subsample", 0.7, 1)),
            "colsample_bytree":
            scope.float(hp.uniform("colsample_bytree", 0.7, 1)),
            "gamma": scope.int(hp.quniform("gamma", 0, 20, 1)),
            "reg_alpha": scope.float(hp.uniform("reg_alpha", 0.01, 1)),
            "reg_lambda": scope.float(hp.uniform("reg_lambda", 0.01, 1)),
            # "scale_pos_weight":scope.float(hp.uniform("scale_pos_weight", 0.001, 1)),
        }

    # optimize function
    trails = Trials()
    optimization_function = partial(optimize, X=X, y=y)
    result = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trails,
        verbose=1,
    )

    print("Best Result is:", "_" * 10, result)
    return result, trails
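# Note (added sketch): the `optimize` callback wrapped by partial() above is not
# shown in this example. A minimal, hypothetical version for the random-forest
# case might look like this; other MODEL values would dispatch to their own
# estimators in the same way.
from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def optimize(params, X, y):
    """Hypothetical objective: negated mean CV accuracy, so fmin minimises it."""
    model = RandomForestClassifier(**params)
    accuracy = cross_val_score(model, X, y, cv=5, scoring="accuracy").mean()
    return {"loss": -accuracy, "status": STATUS_OK}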
Esempio n. 53
0
def main():
    global objective
    if args.mode == "train":
        train(args.num)
    elif args.mode == "backtest":
        backtest(args.num)
    elif args.mode == "hyperopt":
        if args.train_option == "mongo":
            for _ in range(args.rounds):
                trials = MongoTrials(f'mongo://localhost:1234/my_db/jobs')
                pid = os.fork()
                if pid == 0:
                    # processes = [Process(
                    #     target = os.system("hyperopt-mongo-worker --mongo=localhost:1234/my_db --poll-interval=0.1")
                    # ) for _ in range(args.workers)]
                    # for p in processes:
                    #     p.start()

                    # for p in processes:
                    #     p.join()
                    continue
                else:
                    main_path, types = "../..", "mongo_workers"
                    objective = partial(objective,
                                        args=args,
                                        main_path=main_path,
                                        types=types)
                    best = fmin(fn=objective,
                                space=HP_SPACE,
                                algo=tpe.suggest,
                                max_evals=len(trials._trials) + 8,
                                trials=trials)
                    serialize(best)
                    with open(f"{main_path}/{types}/best_net_config.json",
                              'w') as of:
                        json.dump(best, of)
        elif args.train_option == "normal":
            for _ in range(args.rounds):
                p = Path(f"./agents/trials")
                p.mkdir(parents=True, exist_ok=True)
                if os.path.isfile(f"./agents/trials/trials.p"):
                    with open(f"./agents/trials/trials.p",
                              "rb") as file_trials:
                        trials = pickle.load(file_trials)
                    print("trials loaded")
                else:
                    trials = Trials()
                objective = partial(objective, args=args)
                best = fmin(fn=objective,
                            space=HP_SPACE,
                            algo=tpe.suggest,
                            max_evals=len(trials._dynamic_trials) + 10,
                            trials=trials)
                serialize(best)
                with open(f"./agents/best_net_config.json", 'w') as of:
                    json.dump(best, of)
                with open(f"./agents/trials/trials.p", "wb") as file_trials:
                    pickle.dump(trials, file_trials)
        else:
            raise NameError(
                f"train_option should be set to be mongo or normal, but {args.train_option} is supplied!"
            )
    elif args.mode == "clear_agent":
        clear_var_files_for_agent(args.num, root=".")
    elif args.mode == "clear":
        clear_var_files(root=".")
Esempio n. 54
0
    def work(self, **kwargs):
        self.__dict__.update(kwargs)
        bandit = opt_q_uniform(self.target)
        prior_weight = 2.5
        gamma = 0.20
        algo = partial(
            tpe.suggest,
            prior_weight=prior_weight,
            n_startup_jobs=2,
            n_EI_candidates=128,
            gamma=gamma,
        )

        trials = Trials()
        fmin(passthrough,
             space=bandit.expr,
             algo=algo,
             trials=trials,
             max_evals=self.LEN)
        if self.show_vars:
            import hyperopt.plotting

            hyperopt.plotting.main_plot_vars(trials, bandit, do_show=1)

        idxs, vals = miscs_to_idxs_vals(trials.miscs)
        idxs = idxs["x"]
        vals = vals["x"]

        losses = trials.losses()

        from hyperopt.tpe import ap_filter_trials
        from hyperopt.tpe import adaptive_parzen_samplers

        qu = scope.quniform(1.01, 10, 1)
        fn = adaptive_parzen_samplers["quniform"]
        fn_kwargs = dict(size=(4, ), rng=np.random)
        s_below = pyll.Literal()
        s_above = pyll.Literal()
        b_args = [s_below, prior_weight] + qu.pos_args
        b_post = fn(*b_args, **fn_kwargs)
        a_args = [s_above, prior_weight] + qu.pos_args
        a_post = fn(*a_args, **fn_kwargs)

        # print b_post
        # print a_post
        fn_lpdf = getattr(scope, a_post.name + "_lpdf")
        print(fn_lpdf)
        # calculate the llik of b_post under both distributions
        a_kwargs = dict([(n, a) for n, a in a_post.named_args
                         if n not in ("rng", "size")])
        b_kwargs = dict([(n, a) for n, a in b_post.named_args
                         if n not in ("rng", "size")])
        below_llik = fn_lpdf(*([b_post] + b_post.pos_args), **b_kwargs)
        above_llik = fn_lpdf(*([b_post] + a_post.pos_args), **a_kwargs)
        new_node = scope.broadcast_best(b_post, below_llik, above_llik)

        print("=" * 80)

        do_show = self.show_steps

        for ii in range(2, 9):
            if ii > len(idxs):
                break
            print("-" * 80)
            print("ROUND", ii)
            print("-" * 80)
            all_vals = [2, 3, 4, 5, 6, 7, 8, 9, 10]
            below, above = ap_filter_trials(idxs[:ii], vals[:ii], idxs[:ii],
                                            losses[:ii], gamma)
            below = below.astype("int")
            above = above.astype("int")
            print("BB0", below)
            print("BB1", above)
            # print 'BELOW',  zip(range(100), np.bincount(below, minlength=11))
            # print 'ABOVE',  zip(range(100), np.bincount(above, minlength=11))
            memo = {b_post: all_vals, s_below: below, s_above: above}
            bl, al, nv = pyll.rec_eval([below_llik, above_llik, new_node],
                                       memo=memo)
            # print bl - al
            print("BB2", dict(list(zip(all_vals, bl - al))))
            print("BB3", dict(list(zip(all_vals, bl))))
            print("BB4", dict(list(zip(all_vals, al))))
            print("ORIG PICKED", vals[ii])
            print("PROPER OPT PICKS:", nv)

            # assert np.allclose(below, [3, 3, 9])
            # assert len(below) + len(above) == len(vals)

            if do_show:
                plt.subplot(8, 1, ii)
                # plt.scatter(all_vals,
                #    np.bincount(below, minlength=11)[2:], c='b')
                # plt.scatter(all_vals,
                #    np.bincount(above, minlength=11)[2:], c='c')
                plt.scatter(all_vals, bl, c="g")
                plt.scatter(all_vals, al, c="r")
        if do_show:
            plt.show()
Esempio n. 55
0
def main():

    usage = "%prog <DRLD|MIP|MOLD|Primary|General|Terrorist|PK-Brown|PK-Roberts|PK-Pelosi|PK-Cheney>"
    parser = OptionParser(usage=usage)
    parser.add_option("-m", dest="model", default="LR", help="Model: (LR|SVM|MNB|SVMNB); default=%default")
    parser.add_option("-t", dest="test_fold", default=0, help="Test fold; default=%default")
    parser.add_option("-o", dest="output_dirname", default="bayes_opt", help="Output directory name")
    parser.add_option(
        "--reuse", dest="reuse", action="store_true", default=False, help="Use reusable holdout; default=%default"
    )
    parser.add_option(
        "--alpha",
        dest="alpha",
        action="store_true",
        default=False,
        help="Include alpha in search space (instead of grid search); default=%default",
    )
    parser.add_option(
        "--n_dev_folds",
        dest="n_dev_folds",
        default=5,
        help="Number of dev folds to use when tuning/evaluating; default=%default",
    )

    # parser.add_option('--codes', dest='n_codes', default=33,
    #                  help='Number of codes (only matters with --alpha); default=%default')

    (options, args) = parser.parse_args()

    global output_dirname, output_filename, reuse, search_alpha, space, run, group, test_fold, n_dev_folds

    run = args[0]
    reuse = options.reuse
    search_alpha = options.alpha
    # n_codes = int(options.n_codes)
    output_dirname = options.output_dirname
    model = options.model
    test_fold = int(options.test_fold)
    n_dev_folds = int(options.n_dev_folds)

    # allow user to specify a particular choice of model
    if model == "LR":
        space["model"] = {
            "model": "LR",
            #'regularization': hp.choice('regularization', ['l1', 'l2'])
            "regularization": "l1",
        }
    elif model == "SVM":
        space["model"] = {
            "model": "SVM",
            "kernel": hp.choice(
                "ktype",
                [{"ktype": "linear"}, {"ktype": "poly", "degree": hp.choice("degree", [2, 3, 4])}, {"ktype": "rbf"}],
            ),
        }
    elif model == "MNB":
        space["model"] = {"model": "MNB"}
    elif model == "SVMNB":
        space["model"] = {"model": "SVMNB", "beta": hp.uniform("beta", 0, 1)}
    else:
        sys.exit("Choice of model not supported!")

    if run == "DRLD":
        add_drld()
        group = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]
        n_codes = 33
    elif run == "MIP":
        add_MIP()
        group = ["MIP-Personal-1", "MIP-Personal-2", "MIP-Political-1", "MIP-Political-2"]
        n_codes = 74
    elif run == "MOLD":
        add_MOLD()
        group = ["McCain-Likes", "McCain-Dislikes", "Obama-Likes", "Obama-Dislikes"]
        n_codes = 34
    elif run == "Primary":
        add_obama()
        add_clinton()
        group = ["Obama-Primary", "Clinton-Primary"]
        n_codes = 42
    elif run == "General":
        add_obama()
        add_mccain()
        group = ["Obama-General", "McCain-General"]
        n_codes = 41
    elif run == "Terrorists":
        group = [run]
        n_codes = 28
    elif run == "PK-Brown":
        group = [run]
        n_codes = 14
    elif run == "PK-Cheney":
        group = [run]
        n_codes = 12
    elif run == "PK-Pelosi":
        group = [run]
        n_codes = 15
    elif run == "PK-Roberts":
        group = [run]
        n_codes = 14
    else:
        sys.exit("Dataset not recognized")

    output_dirname += "_" + model

    if search_alpha:
        space["alphas"] = []
        for i in range(n_codes):
            space["alphas"].append(hp.loguniform("alpha" + str(i), -1.15, 9.2))
        output_dirname += "_alphas"

    if reuse:
        output_dirname += "_reuse"
    else:
        output_dirname += "_noreuse"
    output_dirname += "_" + run

    if n_dev_folds != 5:
        output_dirname += "_" + str(n_dev_folds)

    output_filename = fh.make_filename(defines.exp_dir, fh.get_basename(output_dirname), "log")

    with codecs.open(output_filename, "w") as output_file:
        output_file.write(output_dirname + "\n")
        output_file.write("reuse = " + str(reuse) + "\n")
        output_file.write("search alphas = " + str(search_alpha) + "\n")

    trials = Trials()
    best = fmin(call_experiment, space=space, algo=tpe.suggest, max_evals=40, trials=trials)

    print space_eval(space, best)
    print trials.losses()
Esempio n. 56
0
                                              batch_size=1,
                                              nb_epoch=1,
                                              validation_split=0.3,
                                              shuffle=False)
        #optimize_history = optimize_model.fit(train_data, H_t, batch_size=1, nb_epoch=1, validation_split=0.3, shuffle=False)
        optimize_model.reset_states()

    loss_v = optimize_history.history['loss']
    print loss_v

    loss_out = loss_v[-1]

    return {'loss': loss_out, 'status': STATUS_OK}
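# Note (added sketch): the `space` passed to fmin below is not part of this
# fragment. An illustrative definition is given here; the real keys depend on
# what build_lstm_v1.lstm_model_102 expects in `lstm_hidden`.
from hyperopt import hp

space = {
    'Layer1': hp.quniform('Layer1', 10, 100, 10),
    'Layer2': hp.quniform('Layer2', 10, 100, 10),
    'D1': hp.uniform('D1', 0.0, 0.5),
}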


trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=50)

print best

#Building Stateful Model
lstm_hidden = best
tsteps = 24
out_dim = 24

lstm_model = build_lstm_v1.lstm_model_102(lstm_hidden, train_data.shape[2],
                                          out_dim, tsteps)

for ep in range(20):
    lstm_history = lstm_model.fit(train_data,
                                  H_t,
Esempio n. 57
0
    def suggest(self, history, searchspace):
        """
        Suggest params to maximize an objective function based on the
        function evaluation history using a tree of Parzen estimators (TPE),
        as implemented in the hyperopt package.

        Use of this function requires that hyperopt be installed.
        """
        # This function is very odd, because as far as I can tell there's
        # no real documented API for any of the internals of hyperopt. Its
        # execution model is that hyperopt calls your objective function
        # (instead of merely providing you with suggested points, and then
        # you calling the function yourself), and it's very tricky (for me)
        # to use the internal hyperopt data structures to get these predictions
        # out directly.

        # so the path we take in this function is to construct a synthetic
        # hyperopt.Trials database from the `history`, and then call
        # hyperopt.fmin with a dummy objective function that logs the value
        # it is given, and then return that value to our client.

        # The form of the hyperopt.Trials database isn't really documented in
        # the code -- most of this comes from reverse engineering it, by
        # running fmin() on a simple function and then inspecting the form of
        # the resulting trials object.
        if 'hyperopt' not in sys.modules:
            raise ImportError('No module named hyperopt')

        random = check_random_state(self.seed)
        hp_searchspace = searchspace.to_hyperopt()

        trials = Trials()
        for i, (params, scores, status) in enumerate(history):
            if status == 'SUCCEEDED':
                # we're doing maximization, hyperopt.fmin() does minimization,
                # so we need to swap the sign
                result = {'loss': -np.mean(scores), 'status': STATUS_OK}
            elif status == 'PENDING':
                result = {'status': STATUS_RUNNING}
            elif status == 'FAILED':
                result = {'status': STATUS_FAIL}
            else:
                raise RuntimeError('unrecognized status: %s' % status)

            # the vals key in the trials dict is basically just the params
            # dict, but enum variables (hyperopt hp.choice() nodes) are
            # different, because the index of the parameter is specified
            # in vals, not the parameter itself.

            vals = {}
            for var in searchspace:
                if isinstance(var, EnumVariable):
                    # get the index in the choices of the parameter, and use
                    # that.
                    matches = [i for i, c in enumerate(var.choices)
                               if c == params[var.name]]
                    assert len(matches) == 1
                    vals[var.name] = matches
                else:
                    # the other big difference is that all of the param values
                    # are wrapped in length-1 lists.
                    vals[var.name] = [params[var.name]]

            trials.insert_trial_doc({
                'misc': {
                    'cmd': ('domain_attachment', 'FMinIter_Domain'),
                    'idxs': dict((k, [i]) for k in hp_searchspace.keys()),
                    'tid': i,
                    'vals': vals,
                    'workdir': None},
                'result': result,
                'tid': i,
                # bunch of fixed fields that hyperopt seems to require
                'owner': None, 'spec': None, 'state': 2, 'book_time': None,
                'exp_key': None, 'refresh_time': None, 'version': 0
                })

        trials.refresh()
        chosen_params_container = []

        def mock_fn(x):
            # http://stackoverflow.com/a/3190783/1079728
            # to get around the lack of a nonlocal keyword in python2
            chosen_params_container.append(x)
            return 0

        fmin(fn=mock_fn, algo=tpe.suggest, space=hp_searchspace, trials=trials,
             max_evals=len(trials.trials)+1,
             **self._hyperopt_fmin_random_kwarg(random))
        chosen_params = chosen_params_container[0]

        return chosen_params
Esempio n. 58
0
        data_path_label = os.path.join(data_path,'layer_'+str(layer),'Label_cluster')
        path = os.path.join(data_path_label,'label_cluster_'+str(num_clust)+'.npz')
        data = np.load(path)
        label = data['a']
        outputpath = os.path.join(main_path,str(num_clust)+'_clustering')
        create_saving_folder(outputpath)
        print('################### Tests for a ',num_clust,' centers clustering ###################')
        for nbc in range(2,num_clust+1):
            print('################ Predicting voxels for cluster number ',nbc,' ################')
            masker = find_cluster(label,masker,data_path,nbc)
            mean = masker.transform(score_img).mean()
            maxi = masker.transform(score_img).max()
            print('Mean: ',mean, 'Max: ', maxi)
            print('')

            if optimized == True:
                #we look for optimizations 
                if num_clust == nbc == 2:
                    best_run, best_model = optim.minimize(model=model_optimization, data=retrieve_data, algo=tpe.suggest, max_evals=5, trials=Trials())
                    X_train,y_train,X_test,y_test = retrieve_data(loaded_stimuli,fmri_ready)
                    print("Evalutation of best performing model:")
                    print(best_model.evaluate(X_test, y_test))
                    print("\nBest performing model chosen hyper-parameters:")
                    print(best_run)
                else:
                    pass
                second_processing(masker,filename_irm,filename_stimuli,outputpath,meanepi,alpha,nbc,num_clust,best_run)
            
            else:

                second_processing(masker,filename_irm,filename_stimuli,outputpath,meanepi,alpha,nbc,num_clust)
Esempio n. 59
0
    def work(self, **kwargs):
        self.__dict__.update(kwargs)
        bandit = opt_q_uniform(self.target)
        prior_weight = 2.5
        gamma = 0.20
        algo = partial(tpe.suggest,
                prior_weight=prior_weight,
                n_startup_jobs=2,
                n_EI_candidates=128,
                gamma=gamma)
        #print algo.opt_idxs['x']
        #print algo.opt_vals['x']

        trials = Trials()
        fmin(passthrough,
            space=bandit.expr,
            algo=algo,
            trials=trials,
            max_evals=self.LEN)
        if self.show_vars:
            import hyperopt.plotting
            hyperopt.plotting.main_plot_vars(trials, bandit, do_show=1)

        idxs, vals = miscs_to_idxs_vals(trials.miscs)
        idxs = idxs['x']
        vals = vals['x']

        losses = trials.losses()

        from hyperopt.tpe import ap_filter_trials
        from hyperopt.tpe import adaptive_parzen_samplers

        qu = scope.quniform(1.01, 10, 1)
        fn = adaptive_parzen_samplers['quniform']
        fn_kwargs = dict(size=(4,), rng=np.random)
        s_below = pyll.Literal()
        s_above = pyll.Literal()
        b_args = [s_below, prior_weight] + qu.pos_args
        b_post = fn(*b_args, **fn_kwargs)
        a_args = [s_above, prior_weight] + qu.pos_args
        a_post = fn(*a_args, **fn_kwargs)

        #print b_post
        #print a_post
        fn_lpdf = getattr(scope, a_post.name + '_lpdf')
        print fn_lpdf
        # calculate the llik of b_post under both distributions
        a_kwargs = dict([(n, a) for n, a in a_post.named_args
                    if n not in ('rng', 'size')])
        b_kwargs = dict([(n, a) for n, a in b_post.named_args
                    if n not in ('rng', 'size')])
        below_llik = fn_lpdf(*([b_post] + b_post.pos_args), **b_kwargs)
        above_llik = fn_lpdf(*([b_post] + a_post.pos_args), **a_kwargs)
        new_node = scope.broadcast_best(b_post, below_llik, above_llik)

        print '=' * 80

        do_show = self.show_steps

        for ii in range(2, 9):
            if ii > len(idxs):
                break
            print '-' * 80
            print 'ROUND', ii
            print '-' * 80
            all_vals = [2, 3, 4, 5, 6, 7, 8, 9, 10]
            below, above = ap_filter_trials(idxs[:ii],
                    vals[:ii], idxs[:ii], losses[:ii], gamma)
            below = below.astype('int')
            above = above.astype('int')
            print 'BB0', below
            print 'BB1', above
            #print 'BELOW',  zip(range(100), np.bincount(below, minlength=11))
            #print 'ABOVE',  zip(range(100), np.bincount(above, minlength=11))
            memo = {b_post: all_vals, s_below: below, s_above: above}
            bl, al, nv = pyll.rec_eval([below_llik, above_llik, new_node],
                    memo=memo)
            #print bl - al
            print 'BB2', dict(zip(all_vals, bl - al))
            print 'BB3', dict(zip(all_vals, bl))
            print 'BB4', dict(zip(all_vals, al))
            print 'ORIG PICKED', vals[ii]
            print 'PROPER OPT PICKS:', nv

            #assert np.allclose(below, [3, 3, 9])
            #assert len(below) + len(above) == len(vals)

            if do_show:
                plt.subplot(8, 1, ii)
                #plt.scatter(all_vals,
                #    np.bincount(below, minlength=11)[2:], c='b')
                #plt.scatter(all_vals,
                #    np.bincount(above, minlength=11)[2:], c='c')
                plt.scatter(all_vals, bl, c='g')
                plt.scatter(all_vals, al, c='r')
        if do_show:
            plt.show()
              callbacks=[earlyStopping, history],
              verbose=0,
              nb_epoch=100)

    print('MSE:', earlyStopping.best)
    return {'loss': earlyStopping.best, 'status': STATUS_OK}


n = 0

#https://github.com/keras-team/keras/issues/3945#issuecomment-281312732
from keras import backend as K
K.set_image_dim_ordering('th')
#for ordering error karas 1->2

trials = Trials()
best = hypfmin(f_nn, space, algo=tpe.suggest, max_evals=50, trials=trials)
print('best: ')
print(best)

with open(model_params_dir + 'hyperparam_test.pkl', 'w') as f:
    pickle.dump(trials.trials, f)

opt_params = {}

for p in best:
    opt_params[p] = hyperparams[p][best[p]]

opt_params

model = create_model(opt_params)