Example #1
import os

import numpy as np

import misc  # project-local helpers (logging, CV fitting, grid search, XML I/O); import path assumed


def train_EXT(estimator, trainX, trainY, method, n_jobs=4, skip=False):
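    """Stage-wise hyperparameter tuning for an Extremely Randomized Trees
    (ExtraTrees) classifier.

    Tunes n_estimators first, then max_depth/min_samples_split,
    min_samples_leaf and max_features, and finally re-tunes n_estimators.
    The tuned parameters are saved to (or, with skip=True, loaded from)
    an XML file under params/.
    """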
    # Extremely Randomized Trees
    logger = misc.init_logger(method)
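    # the params XML is named after the part of `method` following the first '_'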
    xmlPath = os.path.join(os.path.dirname(__file__), "params",
                           '%s.xml' % method[method.find('_') + 1:])
    if not skip:
        logger.info("Begin to train ExtraTrees...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)

        # fine tune n_estimators
        param_grid = {"n_estimators": np.arange(50, 601, 50)}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=False,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(n_estimators=best_params['n_estimators'])

        # fine tune max_depth and min_samples_split
        param_grid = {
            "max_depth": np.arange(5, 30, 2),
            "min_samples_split": np.arange(0.005, 0.031, 0.005)
        }
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=False,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'])

        # fine tune min_samples_leaf
        param_grid = {"min_samples_leaf": np.arange(5, 51, 5)}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=False,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(min_samples_leaf=best_params['min_samples_leaf'])

        # fine tune max_features
        feat_num = len(list(trainX.columns))
        param_grid = {
            "max_features":
            np.arange(int(np.sqrt(feat_num)), int(0.4 * feat_num), 2)
        }
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=False,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        if best_params['max_features'] == int(np.sqrt(feat_num)):
            # 'sqrt' is equivalent to the old 'auto' alias, which newer
            # scikit-learn releases no longer accept
            estimator.set_params(max_features='sqrt')
        else:
            estimator.set_params(max_features=best_params['max_features'])

        # re-tune n_estimators with the other parameters fixed
        param_grid = {"n_estimators": np.arange(40, 1001, 40)}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=False,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(n_estimators=best_params['n_estimators'])
        misc.update_params_toXML(estimator, method, xmlPath)

        logger.info("After parameters tuning, Get the CV score...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)
    else:
        try:
            estimator = misc.load_params_fromXML(estimator, method, xmlPath)
        except Exception:
            logger.warning("Could not load parameters from %s; "
                           "returning the estimator unchanged." % xmlPath)
            return estimator
    logger.info("After parameters tuning. The current parameters are\n %s" %
                str(estimator.get_params()))
    return estimator
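
A minimal usage sketch for the function above (all names hypothetical: trainX is assumed to be a pandas DataFrame, since the code reads trainX.columns; trainY holds its labels; and the project-local `misc` module must be importable, with only the part of `method` after the first '_' naming the params XML file):

from sklearn.ensemble import ExtraTreesClassifier

# start from a rough baseline; train_EXT tunes the rest stage by stage
ext = ExtraTreesClassifier(random_state=27)
ext = train_EXT(ext, trainX, trainY, method='train_EXT', n_jobs=4)
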
Example #2
import os

import numpy as np

import misc  # project-local helpers (logging, CV fitting, grid search, XML I/O); import path assumed


def train_GBDT(estimator, trainX, trainY, method, n_jobs=4, skip=False):
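    """Stage-wise hyperparameter tuning for a Gradient Boosted Decision
    Trees (GBDT) classifier.

    Tunes n_estimators, the tree-shape parameters, max_features and
    subsample, then jointly re-tunes learning_rate and n_estimators
    while keeping their product roughly constant. The tuned parameters
    are saved to (or, with skip=True, loaded from) an XML file under
    params/.
    """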
    # GBDT
    logger = misc.init_logger(method)
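    # the params XML is named after the part of `method` following the first '_'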
    xmlPath = os.path.join(os.path.dirname(__file__), "params",
                           '%s.xml' % method[method.find('_') + 1:])
    if not skip:
        logger.info("Begin to train GBDT...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)

        # fine tune n_estimators
        param_grid = {"n_estimators": np.arange(50, 601, 50)}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=True,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        best_n_estimators = best_params['n_estimators']
        estimator.set_params(n_estimators=best_n_estimators)

        # fine tune max_depth and min_samples_split
        param_grid = {
            "max_depth": np.arange(5, 30, 2),
            "min_samples_split": np.arange(0.005, 0.031, 0.005)
        }
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=True,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'])

        # fine tune min_samples_leaf
        param_grid = {"min_samples_leaf": np.arange(5, 51, 5)}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=True,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(min_samples_leaf=best_params['min_samples_leaf'])

        # fine tune max_features
        feat_num = len(list(trainX.columns))
        param_grid = {
            "max_features":
            np.arange(int(np.sqrt(feat_num)), int(0.4 * feat_num), 2)
        }
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=True,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        if best_params['max_features'] == int(np.sqrt(feat_num)):
            # 'sqrt' is equivalent to the old 'auto' alias, which newer
            # scikit-learn releases no longer accept
            estimator.set_params(max_features='sqrt')
        else:
            estimator.set_params(max_features=best_params['max_features'])

        # fine tune subsample
        param_grid = {"subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}
        best_params, best_score = misc.run_gridsearch(trainX,
                                                      trainY,
                                                      estimator,
                                                      param_grid,
                                                      sample_weight=True,
                                                      cv=5,
                                                      scoring='roc_auc',
                                                      n_jobs=n_jobs,
                                                      method=method)
        estimator.set_params(subsample=best_params['subsample'])

        # jointly re-tune learning_rate and n_estimators, keeping their
        # product roughly constant
        pairs = [(0.1, best_n_estimators),
                 (0.075, int(best_n_estimators * 4.0 / 3)),
                 (0.05, best_n_estimators * 2),
                 (0.04, int(best_n_estimators * 5.0 / 2)),
                 (0.03, int(best_n_estimators * 10.0 / 3)),
                 (0.01, best_n_estimators * 10),
                 (0.005, best_n_estimators * 20)]
        max_n_estimators = 2400
        opt_params = None
        opt_score = 0.0
        for learning_rate, n_estimators in pairs:
            if n_estimators > max_n_estimators:
                break
            estimator.set_params(learning_rate=learning_rate,
                                 n_estimators=n_estimators)
            auc_score, acc_score = misc.modelfit(estimator,
                                                 trainX,
                                                 trainY,
                                                 method,
                                                 n_jobs=n_jobs)
            logger.info(
                "With learning_rate %s, n_estimators %s, auc_score is %s, acc_score is %s"
                % (learning_rate, n_estimators, auc_score, acc_score))
            if auc_score > opt_score:
                opt_params = (learning_rate, n_estimators)
                opt_score = auc_score
        logger.info(
            "best learning_rate is %s, best n_estimators is %s. The corresponding auc_score is %s"
            % (opt_params[0], opt_params[1], opt_score))
        estimator.set_params(learning_rate=opt_params[0],
                             n_estimators=opt_params[1])
        misc.update_params_toXML(estimator, method, xmlPath)

        logger.info("After parameters tuning, Get the CV score...")
        misc.modelfit(estimator, trainX, trainY, method, n_jobs=n_jobs)
    else:
        try:
            estimator = misc.load_params_fromXML(estimator, method, xmlPath)
        except Exception:
            logger.warning("Could not load parameters from %s; "
                           "returning the estimator unchanged." % xmlPath)
            return estimator
    logger.info("After parameters tuning. The current parameters are\n %s" %
                str(estimator.get_params()))
    return estimator
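
A minimal usage sketch for the function above (hypothetical names again: trainX/trainY are the training frame and labels, `misc` is importable, and 'train_GBDT' is a made-up method tag):

from sklearn.ensemble import GradientBoostingClassifier

# baseline with a deliberately coarse learning rate; the last tuning stage
# trades learning_rate down against a larger n_estimators
gbdt = GradientBoostingClassifier(learning_rate=0.1, random_state=27)
gbdt = train_GBDT(gbdt, trainX, trainY, method='train_GBDT', n_jobs=4)

Both examples lean on misc.run_gridsearch, whose implementation is not shown. A plausible shape, assuming it simply wraps scikit-learn's GridSearchCV (the real helper may do more, e.g. log or plot CV results):

from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight

def run_gridsearch(trainX, trainY, estimator, param_grid, sample_weight=False,
                   cv=5, scoring='roc_auc', n_jobs=4, method=None):
    # Hypothetical reconstruction: exhaustive search over param_grid with
    # k-fold CV; optionally pass balanced per-sample weights to fit().
    search = GridSearchCV(estimator, param_grid, scoring=scoring, cv=cv,
                          n_jobs=n_jobs)
    fit_params = {}
    if sample_weight:
        fit_params['sample_weight'] = compute_sample_weight('balanced', trainY)
    search.fit(trainX, trainY, **fit_params)
    return search.best_params_, search.best_score_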