Example #1
def keras_classifier_bayesian_opt(model_config, input_length):
    """Configure Bayesian optimisation for the Keras classifier:
    3-fold CV, AUC/accuracy scoring, maximise AUC, 30 trials."""
    bayesian_opt = KerasClassifierBayesianOpt(
        model_config, keras_classifier_bayesian_space(), input_length)
    bayesian_opt.nkfolds = 3
    bayesian_opt.scoring = get_scorers(["AUC", "Accuracy"])
    bayesian_opt.scoring_opt = "AUC"
    bayesian_opt.low_is_better = False
    bayesian_opt.n_trials = 30
    return bayesian_opt
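
Note that get_scorers is a helper from the surrounding project and is not shown here. A minimal, hypothetical sketch of what it plausibly returns, assuming the metric labels map to the standard scikit-learn scorers (the name mapping is an assumption, not the project's code):

from sklearn.metrics import get_scorer

def get_scorers_sketch(metric_names):
    # Hypothetical stand-in for the project's get_scorers helper:
    # map metric labels to scikit-learn scorer objects.
    name_map = {"AUC": "roc_auc", "Accuracy": "accuracy"}
    return {name: get_scorer(name_map[name]) for name in metric_names}

# e.g. get_scorers_sketch(["AUC", "Accuracy"])
#   -> {"AUC": <roc_auc scorer>, "Accuracy": <accuracy scorer>}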
def xgboost_classifier_bayesian_opt(model_config):
    """Configure Bayesian optimisation for the XGBoost classifier:
    3-fold CV, AUC/accuracy scoring, maximise AUC, 100 trials."""
    bayesian_opt = XGBoostClassifierBayesianOpt(
        model_config, xgboost_classifier_bayesian_space())
    bayesian_opt.nkfolds = 3
    bayesian_opt.scoring = get_scorers(["AUC", "Accuracy"])
    bayesian_opt.scoring_opt = "AUC"
    bayesian_opt.low_is_better = False
    bayesian_opt.n_trials = 100
    return bayesian_opt
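
These factory functions only declare the search settings (3-fold CV, AUC as the optimisation target, higher-is-better, 30 or 100 trials); the optimisation loop itself lives in KerasClassifierBayesianOpt/XGBoostClassifierBayesianOpt, which are not shown here. A minimal, hypothetical sketch of an equivalent loop using Optuna and scikit-learn, where the search space and objective are purely illustrative and do not come from xgboost_classifier_bayesian_space():

import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def objective(trial, x_train, y_train):
    # Illustrative search space; the real one is defined by
    # xgboost_classifier_bayesian_space() in the project.
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
    }
    clf = XGBClassifier(**params)
    # nkfolds = 3 and scoring_opt = "AUC", as in the config above
    scores = cross_val_score(clf, x_train, y_train, cv=3, scoring="roc_auc")
    return scores.mean()

# low_is_better = False -> maximise; n_trials = 100 as in xgboost_classifier_bayesian_opt
study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, x_train, y_train), n_trials=100)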
Example #3
def do_gridsearch(names,
                  classifiers,
                  grid_params,
                  x_train,
                  y_train,
                  nkfolds,
                  out_dirs,
                  ncores=-1):
    """Hyperparameter grid search for a list of classifiers

    Given a list of classifiers, do a hyperparameter grid search based on a corresponding
    set of parameters.

    Args:
        names: iterable of classifier names
        classifiers: iterable of classifiers
        grid_params: iterable of parameters used to perform the grid search
        x_train: feature dataframe
        y_train: targets dataframe
        nkfolds: int, cross-validation generator or an iterable
        out_dirs: iterable of output directories where the grid search parameters and a
            pickled summary dataframe are written
        ncores: number of cores to distribute jobs to
    Returns:
        lists of grid search models, the best model and scoring dataframes
    """

    logger = get_logger()

    for clf_name, clf, gps, out_dir in zip(names, classifiers, grid_params,
                                           out_dirs):
        if not gps:
            logger.info("Nothing to be done for grid search of model %s",
                        clf_name)
            continue
        logger.info("Grid search for model %s with following parameters:",
                    clf_name)
        print_dict(gps)

        # Use scorers that work on probabilities: they call model.decision_function or
        # model.predict_proba, as is done for the nominal ROC curves, to judge the
        # performance
        scoring = get_scorers(gps["scoring"])

        grid_search = GridSearchCV(clf,
                                   gps["params"],
                                   cv=nkfolds,
                                   refit=gps["refit"],
                                   scoring=scoring,
                                   n_jobs=ncores,
                                   verbose=2,
                                   return_train_score=True)
        grid_search.fit(x_train, y_train)
        cvres = grid_search.cv_results_

        # Save the results as soon as we have them in case something goes wrong later
        # (it would be quite unfortunate to lose grid search results...)
        out_file = osjoin(out_dir, "results.pkl")
        pickle.dump(pd.DataFrame(cvres), openfile(out_file, "wb"), protocol=4)
        # Parameters
        dump_yaml_from_dict(gps, osjoin(out_dir, "parameters.yaml"))
        savemodels((clf_name, ), (grid_search.best_estimator_, ), out_dir, "")
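
A hypothetical call to do_gridsearch, inferred from how the arguments are used above (each grid_params entry needs "scoring", "refit" and "params" keys); the classifier, parameter ranges and output directory are only an example, and x_train/y_train are assumed to be the training dataframes from the caller:

from sklearn.ensemble import RandomForestClassifier

names = ["rf"]
classifiers = [RandomForestClassifier()]
grid_params = [{
    "scoring": ["AUC", "Accuracy"],   # passed through get_scorers
    "refit": "AUC",                   # metric used to pick best_estimator_
    "params": {"n_estimators": [100, 300],
               "max_depth": [3, 6]},
}]
out_dirs = ["./gridsearch_rf"]        # results.pkl and parameters.yaml end up here

# do_gridsearch(names, classifiers, grid_params, x_train, y_train,
#               nkfolds=5, out_dirs=out_dirs, ncores=-1)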