def keras_classifier_bayesian_opt(model_config, input_length):
    """Construct a Bayesian optimiser for the Keras classifier."""
    bayesian_opt = KerasClassifierBayesianOpt(model_config, keras_classifier_bayesian_space(),
                                              input_length)
    bayesian_opt.nkfolds = 3                                 # 3-fold cross validation
    bayesian_opt.scoring = get_scorers(["AUC", "Accuracy"])  # scorers evaluated per trial
    bayesian_opt.scoring_opt = "AUC"                         # score the search optimises
    bayesian_opt.low_is_better = False                       # higher AUC is better
    bayesian_opt.n_trials = 30
    return bayesian_opt
def xgboost_classifier_bayesian_opt(model_config):
    """Construct a Bayesian optimiser for the XGBoost classifier."""
    bayesian_opt = XGBoostClassifierBayesianOpt(model_config, xgboost_classifier_bayesian_space())
    bayesian_opt.nkfolds = 3                                 # 3-fold cross validation
    bayesian_opt.scoring = get_scorers(["AUC", "Accuracy"])  # scorers evaluated per trial
    bayesian_opt.scoring_opt = "AUC"                         # score the search optimises
    bayesian_opt.low_is_better = False                       # higher AUC is better
    bayesian_opt.n_trials = 100
    return bayesian_opt
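# Usage sketch (not part of the original module, added for illustration): the two factory
# functions apply identical cross-validation and scoring settings and differ only in the
# wrapped optimiser, search space and trial budget. `model_config` is assumed to be the
# model section of the package's configuration, and `input_length` the number of input
# features; the concrete value below is hypothetical. How the returned optimiser is then
# run depends on the BayesianOpt base class and is not shown here.
def _example_bayesian_opt_setup(model_config):
    keras_opt = keras_classifier_bayesian_opt(model_config, input_length=10)  # 10 is illustrative
    xgb_opt = xgboost_classifier_bayesian_opt(model_config)
    # Both now share 3-fold CV and AUC as the optimised score (higher is better);
    # they differ in n_trials (30 for Keras vs 100 for XGBoost).
    return keras_opt, xgb_opt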
def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, out_dirs,
                  ncores=-1):
    """Hyperparameter grid search for a list of classifiers

    Given a list of classifiers, do a hyperparameter grid search based on a
    corresponding set of parameters.

    Args:
        names: iterable of classifier names
        classifiers: iterable of classifiers
        grid_params: iterable of parameter dictionaries used to perform the grid search
        x_train: feature dataframe
        y_train: targets dataframe
        nkfolds: int, cross-validation generator or an iterable
        out_dirs: iterable of directories where parameters, cv results and the best
            model of each grid search are written
        ncores: number of cores to distribute jobs to
    Returns:
        None; per-model cv results (results.pkl), search parameters (parameters.yaml)
        and the best estimator are saved in the corresponding out_dirs entry
    """
    logger = get_logger()
    for clf_name, clf, gps, out_dir in zip(names, classifiers, grid_params, out_dirs):
        if not gps:
            logger.info("Nothing to be done for grid search of model %s", clf_name)
            continue
        logger.info("Grid search for model %s with following parameters:", clf_name)
        print_dict(gps)
        # To work for probabilities. This will call model.decision_function or
        # model.predict_proba, as is done for the nominal ROC curves as well, to decide
        # on the performance
        scoring = get_scorers(gps["scoring"])
        grid_search = GridSearchCV(clf, gps["params"], cv=nkfolds, refit=gps["refit"],
                                   scoring=scoring, n_jobs=ncores, verbose=2,
                                   return_train_score=True)
        grid_search.fit(x_train, y_train)
        cvres = grid_search.cv_results_
        # Save the results as soon as we have them in case something goes wrong later
        # (would be quite unfortunate to lose grid search results...)
        out_file = osjoin(out_dir, "results.pkl")
        pickle.dump(pd.DataFrame(cvres), openfile(out_file, "wb"), protocol=4)
        # Parameters
        dump_yaml_from_dict(gps, osjoin(out_dir, "parameters.yaml"))
        savemodels((clf_name, ), (grid_search.best_estimator_, ), out_dir, "")
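# Usage sketch (not part of the original module, added for illustration): drives
# do_gridsearch for a single scikit-learn model. The keys "params", "scoring" and
# "refit" mirror what the function body reads from each grid_params entry; the
# concrete classifier, grid values and output directory are hypothetical.
def _example_do_gridsearch_usage(x_train, y_train):
    from sklearn.ensemble import RandomForestClassifier
    gps = {"params": {"n_estimators": [100, 300], "max_depth": [3, 5]},
           "scoring": ["AUC", "Accuracy"],   # resolved to scorers via get_scorers
           "refit": "AUC"}                   # score used to pick the best estimator
    do_gridsearch(("random_forest",), (RandomForestClassifier(),), (gps,),
                  x_train, y_train, nkfolds=3, out_dirs=("./gridsearch_rf",))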