Ejemplo n.º 1
0
def build_model(X_train, y_train, X_valid, y_valid):
    """Fit an ``XGBRFClassifier`` with a fixed set of hyper-parameters.

    Both the training split and the validation split are passed to ``fit``
    as evaluation sets and scored with the ``merror`` metric.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_valid, y_valid
        Features and labels of the second evaluation set.

    Returns
    -------
    The fitted ``XGBRFClassifier`` instance.
    """
    hyper_params = {
        'base_score': 2,
        'colsample_bylevel': 0.75,
        'colsample_bynode': 0.57,
        'colsample_bytree': 0.95,
        'gamma': 0.25,
        'learning_rate': 1.7,
        'max_depth': 18,
        'min_child_weight': 0.025,
        'n_estimators': 353,
        'n_jobs': -1,
        'num_class': 3,
        'num_parallel_tree': 105,
        'objective': 'multi:softmax',
        'random_state': 42,
        'subsample': 0.8,
        'verbosity': 0,
        'reg_alpha': 0.05,
        'reg_lambda': 1,
        'rate_drop': 0.5,
    }
    classifier = XGBRFClassifier(**hyper_params)

    # Evaluation sets: training data first, validation data last.
    watch_sets = [(X_train, y_train), (X_valid, y_valid)]

    # NOTE(review): early stopping is requested twice -- 50 rounds through
    # the keyword below and 15 rounds through the callback; confirm which
    # one is intended.
    fit_callbacks = [
        print_evaluation(period=5),
        early_stop(stopping_rounds=15),
    ]

    classifier.fit(
        X_train,
        y_train,
        eval_set=watch_sets,
        eval_metric=['merror'],
        early_stopping_rounds=50,
        callbacks=fit_callbacks,
        verbose=False,
    )
    return classifier
def train_and_evaluate_xgb(train, early, val, params):
    """Train a model with early stopping and score it on a hold-out split.

    Parameters
    ----------
    train : tuple
        ``(X, y)`` pair used to fit the model.
    early : tuple
        ``(X, y)`` pair passed as the last evaluation set to ``fit``
        together with the early-stopping callback.
    val : tuple
        ``(X, y)`` pair used only for the final score.
    params
        Forwarded unchanged to ``create_xgb_model``.

    Each ``X`` must expose ``.columns`` and ``.drop`` (e.g. a pandas
    DataFrame) and contain the ``ID`` and ``ID_temp`` columns. Every column
    of the training frame whose name contains ``'skew'`` must exist in all
    three frames.

    Returns
    -------
    The mean squared error between the model's predictions on the
    validation features and ``y_val``.

    Notes
    -----
    The input frames are not modified: feature views are built with
    ``drop(columns=...)``, which returns new frames. The keyword form is
    also required by pandas >= 2.0, where ``axis`` can no longer be passed
    positionally to ``DataFrame.drop``.
    """
    X_train, y_train = train
    X_early, y_early = early
    X_val, y_val = val

    # create model
    model = create_xgb_model(params)

    # Columns never shown to the model: every "skew" feature found in the
    # training frame plus the two identifier columns.
    excluded = [col for col in X_train.columns if 'skew' in col]
    excluded += ['ID', 'ID_temp']

    train_features = X_train.drop(columns=excluded)
    early_features = X_early.drop(columns=excluded)
    val_features = X_val.drop(columns=excluded)

    # Fit model using early stopping (a distinct local name keeps the
    # ``early`` argument from being shadowed).
    stopper = early_stop(stopping_rounds=30, maximize=False)
    model.fit(train_features,
              y_train,
              eval_set=[(train_features, y_train),
                        (early_features, y_early)],
              callbacks=[stopper])

    # Validation evaluation; arguments follow the (y_true, y_pred)
    # convention, which leaves the squared error unchanged.
    y_val_preds = model.predict(val_features)
    return mean_squared_error(y_val, y_val_preds)
Ejemplo n.º 3
0
    def run(self,
            data,
            y,
            groups,
            test,
            eval_metric,
            n_splits=10,
            early_stopping_rounds=100):
        """Train ``self.clf`` over grouped K-fold splits and collect predictions.

        For every fold the classifier is fitted on the training part, the
        positive-class probability of the held-out part is written into the
        out-of-fold vector, and the positive-class probability of ``test``
        is stored as one column of the returned frame.  Per-fold feature
        importances are appended to ``self.df_feature_importance``.

        Parameters
        ----------
        data, y
            Features and target; both are indexed positionally via ``.iloc``.
        groups
            Group labels handed to ``GroupKFold.split``.
        test
            Data scored after every fold.
        eval_metric
            Forwarded to ``self.clf.fit``.
        n_splits : int
            Number of ``GroupKFold`` splits.
        early_stopping_rounds : int
            Forwarded to the ``early_stop`` callback.

        Returns
        -------
        tuple
            ``(oof_preds, per_fold_test_preds, self.clf)``.
        """
        oof_preds_LGBM = np.zeros((data.shape[0]))
        df_sub_preds_LGBM = pd.DataFrame()
        self.df_feature_importance = pd.DataFrame()

        if not self.clf:
            self.build_clf()

        splitter = GroupKFold(n_splits=n_splits)
        fold_iter = enumerate(splitter.split(data, y, groups))
        for fold_idx, (train_idx, valid_idx) in fold_iter:
            fold_no = fold_idx + 1  # one-based number used for reporting

            train_x = data.iloc[train_idx]
            train_y = y.iloc[train_idx]
            valid_x = data.iloc[valid_idx]
            valid_y = y.iloc[valid_idx]

            banner = "Starting LightGBM. Fold {},Train shape: {}, test shape: {}"
            print(banner.format(fold_no, data.shape, test.shape))

            stopper = early_stop(early_stopping_rounds,
                                 maximize=True,
                                 verbose=True)
            self.clf.fit(train_x,
                         train_y,
                         eval_set=[(train_x, train_y), (valid_x, valid_y)],
                         eval_metric=eval_metric,
                         verbose=100,
                         callbacks=[stopper])

            # Column 1 of predict_proba is taken as the score of interest.
            oof_preds_LGBM[valid_idx] += self.clf.predict_proba(valid_x)[:, 1]
            test_column = 'fold_{}'.format(fold_idx)
            df_sub_preds_LGBM[test_column] = self.clf.predict_proba(test)[:, 1]

            df_fold_importance = pd.DataFrame()
            df_fold_importance["feature"] = self.features
            df_fold_importance["importance"] = self.clf.feature_importances_
            df_fold_importance["fold"] = fold_no

            self.df_feature_importance = pd.concat(
                [self.df_feature_importance, df_fold_importance], axis=0)

        oof_score = average_precision_score(y, oof_preds_LGBM)
        print('Summary:')
        print('XGB Testing_Set average_precision_score %.6f' % oof_score)

        return oof_preds_LGBM, df_sub_preds_LGBM, self.clf
Ejemplo n.º 4
0
        def xgb_evaluate(min_child_weight, colsample_bytree, max_depth,
                         subsample, gamma, alpha, max_delta_step):
            """Score one hyper-parameter candidate with 5-fold ``xgb.cv``.

            The candidate values are sanitised and written into the
            enclosing ``params`` dict: ``min_child_weight``, ``max_depth``
            and ``max_delta_step`` are truncated with ``int``, the two
            sampling ratios are clamped to ``[0, 1]`` and ``gamma``,
            ``alpha`` and ``max_delta_step`` are floored at zero.

            Returns the last ``test-auc-mean`` entry of the CV result.
            """
            def _clamp_unit(value):
                # Restrict a ratio to the closed interval [0, 1].
                return max(min(value, 1), 0)

            params.update({
                'min_child_weight': int(min_child_weight),
                'colsample_bytree': _clamp_unit(colsample_bytree),
                'max_depth': int(max_depth),
                'subsample': _clamp_unit(subsample),
                'gamma': max(gamma, 0),
                'alpha': max(alpha, 0),
                'max_delta_step': max(int(max_delta_step), 0),
            })

            stopper = callback.early_stop(50)
            history = xgb.cv(params,
                             xgtrain,
                             num_boost_round=num_rounds,
                             nfold=5,
                             seed=random_state,
                             callbacks=[stopper])
            auc_means = history['test-auc-mean'].values
            return auc_means[-1]
Ejemplo n.º 5
0
def train(params,
          dtrain,
          num_boost_round=10,
          evals=(),
          obj=None,
          feval=None,
          maximize=False,
          early_stopping_rounds=None,
          evals_result=None,
          verbose_eval=True,
          xgb_model=None,
          callbacks=None,
          learning_rates=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    """Train a booster with given parameters.

    The legacy keyword options (``verbose_eval``, ``early_stopping_rounds``,
    ``evals_result``, ``learning_rates``) are translated into callbacks and
    the actual work is delegated to ``_train_internal``.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to watch
        performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. Validation error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue training.
        Requires at least one item in evals.
        If there's more than one, will use the last.
        Returns the model from the last iteration (not the best one).
        If early stopping occurs, the model will have three additional fields:
        bst.best_score, bst.best_iteration and bst.best_ntree_limit.
        (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
        and/or num_class appears in the parameters)
    evals_result: dict
        This dictionary stores the evaluation results of all the items in watchlist.
        Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
        a parameter containing ('eval_metric': 'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953']},
                  'eval': {'logloss': ['0.480385', '0.357756']}}
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the validation set
        is printed at every given `verbose_eval` boosting stage. The last boosting stage
        / the boosting stage found by using `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function (deprecated - use callback API instead)
        List of learning rate for each boosting round
        or a customized function that calculates eta in terms of
        current number of round and the total number of boosting round (e.g. yields
        learning rate decay)
    xgb_model : file name of stored xgb model or 'Booster' instance
        Xgb model to be loaded before training (allows training continuation).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback module.
        Example: [xgb.callback.reset_learning_rate(custom_rates)]
        The sequence passed in is copied and never modified.

    Returns
    -------
    booster : a trained booster model
    """
    # Copy the caller's sequence: the callbacks derived from the keyword
    # options below are appended, and extending the caller's own list would
    # make them pile up when the same list is reused for another call.
    callbacks = [] if callbacks is None else list(callbacks)

    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        # Also reached for ``False`` (bool is an int subclass); the value is
        # simply forwarded as the printing period.
        callbacks.append(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=bool(verbose_eval)))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))

    if learning_rates is not None:
        warnings.warn(
            "learning_rates parameter is deprecated - use callback API instead",
            DeprecationWarning)
        callbacks.append(callback.reset_learning_rate(learning_rates))

    return _train_internal(params,
                           dtrain,
                           num_boost_round=num_boost_round,
                           evals=evals,
                           obj=obj,
                           feval=feval,
                           xgb_model=xgb_model,
                           callbacks=callbacks)
Ejemplo n.º 6
0
def cv(params,
       X_train,
       y_train,
       features=None,
       num_boost_round=20,
       nfold=3,
       folds=None,
       metrics=(),
       obj=None,
       feval=None,
       maximize=False,
       early_stopping_rounds=None,
       fpreproc=None,
       as_pandas=True,
       verbose_eval=None,
       show_stdv=True,
       seed=1234,
       callbacks=None):
    '''
    Cross-validation with given parameters. Modified from the cv method found
    in the xgboost package (https://github.com/dmlc/xgboost) to use spatial
    data with statistical features without risking data bleed.

    Parameters
    ----------
    params : dict or list of (key, value) pairs
        Booster params. An ``eval_metric`` entry is removed from the params
        and used as the metric list when **metrics** is empty.
    X_train : pandas.DataFrame
        X data to be trained
    y_train : pandas.DataFrame
        y data to be trained
    features : list
        Features selected to be trained. When ``None`` or empty, all columns
        of **X_train** are used.
    num_boost_round : int : 20
        Number of boosting iterations.
    nfold : int : 3
        Number of folds in CV.
    folds : a KFold or StratifiedKFold instance or list of fold indices
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not read it.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV. A single string is treated
        as one metric name.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds : int
        Activates early stopping. Cross-validation metric (average of
        validation metric computed over CV folds) needs to improve at least
        once in every **early_stopping_rounds** round(s) to continue training.
        The last entry in the evaluation history will represent the best
        iteration. If there's more than one metric in the **eval_metric**
        parameter given **params**, the last metric will be used for early
        stopping.
    fpreproc : function
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not read it.
    as_pandas : bool : True
        Return a pd.DataFrame. If False, the raw dict mapping
        ``'<metric>-mean'`` / ``'<metric>-std'`` to per-round lists is
        returned instead.
    verbose_eval : bool, int, or None : None
        Whether to display the progress. If None, no printing callback is
        installed. If True, progress will be displayed at every boosting
        stage. If an integer is given, progress will be displayed at
        every given `verbose_eval` boosting stage.
    show_stdv : bool : True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int : 1234
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not read it.
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks using :ref:`Callback API
        <callback_api>`. The sequence passed in is copied and never modified.
        Example:
            .. code-block:: python
            [xgb.callback.reset_learning_rate(custom_rates)]

    Returns
    -------
    results : pandas.DataFrame or dict
        Results of the cross-validated model with the metrics of each
        boosting round (truncated to the best iteration when early stopping
        fires).
    '''
    # A bare string must be wrapped as one metric name; list("auc") would
    # explode it into single characters.
    if isinstance(metrics, str):
        metrics = [metrics]
    else:
        metrics = list(metrics)

    # ``not features`` would raise on a pandas Index / numpy array, so test
    # for "missing or empty" explicitly.
    if features is None or len(features) == 0:
        features = X_train.columns

    if isinstance(params, list):
        # A list of pairs may repeat 'eval_metric'; keep every occurrence.
        _metrics = [x[1] for x in params if x[0] == 'eval_metric']
        params = dict(params)
        if 'eval_metric' in params:
            params['eval_metric'] = _metrics
    else:
        # Shallow copy so the pop() below never touches the caller's dict.
        params = dict(params)

    if (not metrics) and 'eval_metric' in params:
        if isinstance(params['eval_metric'], list):
            metrics = params['eval_metric']
        else:
            metrics = [params['eval_metric']]
    params.pop("eval_metric", None)

    results = {}
    # create folds in data
    cvfolds, wt_list = mknfold(X_train, y_train, nfold, params, metrics,
                               features)

    # setup callbacks -- on a private copy, so the caller's list is not
    # extended with the early-stop / print callbacks added here.
    callbacks = [] if callbacks is None else list(callbacks)
    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(
            callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    # Callbacks flagged with a truthy ``before_iteration`` attribute run
    # before the folds are updated; all others run after evaluation.
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=None,
                            cvfolds=cvfolds,
                            iteration=i,
                            begin_iteration=0,
                            end_iteration=num_boost_round,
                            rank=0,
                            evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds], wt_list)
        for key, mean, std in res:
            results.setdefault(key + '-mean', []).append(mean)
            results.setdefault(key + '-std', []).append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=None,
                                cvfolds=cvfolds,
                                iteration=i,
                                begin_iteration=0,
                                end_iteration=num_boost_round,
                                rank=0,
                                evaluation_result_list=res))
        except EarlyStopException as e:
            # Keep the history only up to and including the best iteration.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    if as_pandas:
        results = pd.DataFrame.from_dict(results)
    return results
Ejemplo n.º 7
0
def cv(params,
       dtrain,
       num_boost_round=10,
       nfold=3,
       stratified=False,
       folds=None,
       metrics=(),
       obj=None,
       feval=None,
       maximize=False,
       early_stopping_rounds=None,
       fpreproc=None,
       as_pandas=True,
       verbose_eval=None,
       show_stdv=True,
       seed=0,
       callbacks=None,
       shuffle=True):
    # pylint: disable = invalid-name
    """Cross-validation with given parameters.

    Unlike the stock ``xgboost.cv``, this variant also returns the fold
    objects it trained.

    Parameters
    ----------
    params : dict or list of (key, value) pairs
        Booster params. An ``eval_metric`` entry is removed from the params
        and used as the metric list when `metrics` is empty.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling.
    folds : a KFold or StratifiedKFold instance
        Sklearn KFolds or StratifiedKFolds.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    as_pandas : bool, default True
        Return pd.DataFrame when pandas is installed.
        If False or pandas is not installed, the raw dict mapping
        ``'<metric>-mean'`` / ``'<metric>-std'`` to per-round lists is
        returned instead.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, no printing callback is
        installed. If True, progress will be displayed at every boosting
        stage. If an integer is given, progress will be displayed
        at every given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback module.
        Example: [xgb.callback.reset_learning_rate(custom_rates)]
        The sequence passed in is copied and never modified.
    shuffle : bool
        Shuffle data before creating folds.

    Returns
    -------
    (results, cvfolds) : tuple
        ``results`` is the evaluation history (pd.DataFrame or dict, see
        `as_pandas`), truncated to the best iteration when early stopping
        fires; ``cvfolds`` is the fold collection returned by ``mknfold``
        after training.

    Raises
    ------
    XGBoostError
        If `stratified` is True and sklearn is not installed.
    """
    if stratified is True and not SKLEARN_INSTALLED:
        raise XGBoostError(
            'sklearn needs to be installed in order to use stratified cv')

    if isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(params, list):
        # A list of pairs may repeat 'eval_metric'; keep every occurrence.
        _metrics = [x[1] for x in params if x[0] == 'eval_metric']
        params = dict(params)
        if 'eval_metric' in params:
            params['eval_metric'] = _metrics
    else:
        # Shallow copy so the pop() below never touches the caller's dict.
        params = dict(params)

    if len(metrics) == 0 and 'eval_metric' in params:
        if isinstance(params['eval_metric'], list):
            metrics = params['eval_metric']
        else:
            metrics = [params['eval_metric']]

    params.pop("eval_metric", None)

    results = {}
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
                      stratified, folds, shuffle)

    # setup callbacks -- on a private copy, so the caller's list is not
    # extended with the early-stop / print callbacks added here.
    callbacks = [] if callbacks is None else list(callbacks)
    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=False))

    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(
            callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    # Callbacks flagged with a truthy ``before_iteration`` attribute run
    # before the folds are updated; all others run after evaluation.
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=None,
                            cvfolds=cvfolds,
                            iteration=i,
                            begin_iteration=0,
                            end_iteration=num_boost_round,
                            rank=0,
                            evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds])

        for key, mean, std in res:
            results.setdefault(key + '-mean', []).append(mean)
            results.setdefault(key + '-std', []).append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=None,
                                cvfolds=cvfolds,
                                iteration=i,
                                begin_iteration=0,
                                end_iteration=num_boost_round,
                                rank=0,
                                evaluation_result_list=res))
        except EarlyStopException as e:
            # Keep the history only up to and including the best iteration.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    if as_pandas:
        try:
            import pandas as pd
            results = pd.DataFrame.from_dict(results)
        except ImportError:
            # pandas is optional: fall back to the plain dict.
            pass
    return (results, cvfolds)
Ejemplo n.º 8
0
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, xgb_model=None, callbacks=None,
          learning_rates=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    """Train a booster with given parameters.

    The legacy keyword options (``verbose_eval``, ``early_stopping_rounds``,
    ``evals_result``, ``learning_rates``) are translated into callbacks and
    the actual work is delegated to ``_train_internal``.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to watch
        performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. Validation error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue training.
        Requires at least one item in evals.
        If there's more than one, will use the last.
        Returns the model from the last iteration (not the best one).
        If early stopping occurs, the model will have three additional fields:
        bst.best_score, bst.best_iteration and bst.best_ntree_limit.
        (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
        and/or num_class appears in the parameters)
    evals_result: dict
        This dictionary stores the evaluation results of all the items in watchlist.
        Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
        a parameter containing ('eval_metric': 'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953']},
                  'eval': {'logloss': ['0.480385', '0.357756']}}
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the validation set
        is printed at every given `verbose_eval` boosting stage. The last boosting stage
        / the boosting stage found by using `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function (deprecated - use callback API instead)
        List of learning rate for each boosting round
        or a customized function that calculates eta in terms of
        current number of round and the total number of boosting round (e.g. yields
        learning rate decay)
    xgb_model : file name of stored xgb model or 'Booster' instance
        Xgb model to be loaded before training (allows training continuation).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback module.
        Example: [xgb.callback.reset_learning_rate(custom_rates)]
        The sequence passed in is copied and never modified.

    Returns
    -------
    booster : a trained booster model
    """
    # Copy the caller's sequence: the callbacks derived from the keyword
    # options below are appended, and extending the caller's own list would
    # make them pile up when the same list is reused for another call.
    callbacks = [] if callbacks is None else list(callbacks)

    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        # Also reached for ``False`` (bool is an int subclass); the value is
        # simply forwarded as the printing period.
        callbacks.append(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             maximize=maximize,
                                             verbose=bool(verbose_eval)))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))

    if learning_rates is not None:
        warnings.warn(
            "learning_rates parameter is deprecated - use callback API instead",
            DeprecationWarning)
        callbacks.append(callback.reset_learning_rate(learning_rates))

    return _train_internal(params, dtrain,
                           num_boost_round=num_boost_round,
                           evals=evals,
                           obj=obj, feval=feval,
                           xgb_model=xgb_model, callbacks=callbacks)