def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix,
              model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    When the '_kfold' hyperparameter is present, repeated k-fold cross validation is run on
    'train_val_dmatrix' instead of a single training run, and one model per fold is saved
    as '<MODEL_NAME>-<fold index>'. Otherwise a single model is saved as '<MODEL_NAME>'.

    Note that 'train_cfg' is modified in place: 'num_round', 'save_model_on_termination',
    '_tuning_objective_metric', 'early_stopping_rounds', '_kfold' and '_num_cv_round' are
    removed, and 'eval_metric' is replaced by (or removed in favour of) its cleaned value.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Checkpoint directory forwarded to get_callbacks_watchlist()
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if training fails with a message listed in CUSTOMER_ERRORS
    :raises exc.AlgorithmError: if training fails for any other reason
    """
    # Parse arguments for train() API
    num_round = train_cfg.pop("num_round")
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination',
                                              "false")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.pop("_tuning_objective_metric",
                                                  None)
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval, tuning_objective_metric = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    early_stopping_rounds = train_cfg.pop('early_stopping_rounds', None)
    # NOTE(review): this is derived from the caller-supplied val_dmatrix, not from the
    # per-fold validation slice used in the k-fold branch - confirm that is intended.
    early_stopping_data_name = 'validation' if val_dmatrix else None
    early_stopping_metric = None
    if early_stopping_rounds:
        # The tuning objective metric takes precedence over the plain eval metrics;
        # in both cases the last entry is the one used for early stopping.
        if tuning_objective_metric:
            early_stopping_metric = tuning_objective_metric[-1]
        elif eval_metric:
            early_stopping_metric = eval_metric[-1]

    logging.info("Train matrix has {} rows and {} columns".format(
        train_dmatrix.num_row(), train_dmatrix.num_col()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(
            val_dmatrix.num_row()))

    try:
        kfold = train_cfg.pop("_kfold", None)

        if kfold is None:
            xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                train_dmatrix=train_dmatrix,
                val_dmatrix=val_dmatrix,
                model_dir=model_dir,
                checkpoint_dir=checkpoint_dir,
                early_stopping_data_name=early_stopping_data_name,
                early_stopping_metric=early_stopping_metric,
                early_stopping_rounds=early_stopping_rounds,
                save_model_on_termination=save_model_on_termination,
                is_master=is_master)
            add_debugging(callbacks=callbacks,
                          hyperparameters=train_cfg,
                          train_dmatrix=train_dmatrix,
                          val_dmatrix=val_dmatrix)

            # Only the rounds not already covered by 'iteration' are trained.
            bst = xgb.train(train_cfg,
                            train_dmatrix,
                            num_boost_round=num_round - iteration,
                            evals=watchlist,
                            feval=configured_feval,
                            callbacks=callbacks,
                            xgb_model=xgb_model,
                            verbose_eval=False)

        else:
            num_cv_round = train_cfg.pop("_num_cv_round", 1)
            logging.info(
                "Run {}-round of {}-fold cross validation with {} rows".format(
                    num_cv_round, kfold, train_val_dmatrix.num_row()))

            bst = []
            evals_results = []

            num_class = train_cfg.get("num_class", None)
            # 'objective' is optional; a missing value must not crash on
            # None.startswith(), it simply means "not a binary objective".
            objective = train_cfg.get("objective") or ""
            stratify = bool(num_class) or objective.startswith("binary:")
            # RepeatedStratifiedKFold expects X as array-like of shape (n_samples, n_features)
            X = range(train_val_dmatrix.num_row())
            # Labels are only needed (and only fetched) for stratified splitting.
            y = train_val_dmatrix.get_label() if stratify else None
            if y is not None:
                rkf = RepeatedStratifiedKFold(n_splits=kfold,
                                              n_repeats=num_cv_round)
            else:
                rkf = RepeatedKFold(n_splits=kfold, n_repeats=num_cv_round)

            for train_index, val_index in rkf.split(X=X, y=y):
                cv_train_dmatrix = train_val_dmatrix.slice(train_index)
                cv_val_dmatrix = train_val_dmatrix.slice(val_index)

                # len(bst) is the zero-based index of the fold about to be trained,
                # counted across all repetitions.
                xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                    train_dmatrix=cv_train_dmatrix,
                    val_dmatrix=cv_val_dmatrix,
                    model_dir=model_dir,
                    checkpoint_dir=checkpoint_dir,
                    early_stopping_data_name=early_stopping_data_name,
                    early_stopping_metric=early_stopping_metric,
                    early_stopping_rounds=early_stopping_rounds,
                    save_model_on_termination=save_model_on_termination,
                    is_master=is_master,
                    fold=len(bst))
                add_debugging(callbacks=callbacks,
                              hyperparameters=train_cfg,
                              train_dmatrix=cv_train_dmatrix,
                              val_dmatrix=cv_val_dmatrix)

                evals_result = {}
                logging.info(
                    "Train cross validation fold {}".format((len(bst) %
                                                             kfold) + 1))
                booster = xgb.train(train_cfg,
                                    cv_train_dmatrix,
                                    num_boost_round=num_round - iteration,
                                    evals=watchlist,
                                    feval=configured_feval,
                                    evals_result=evals_result,
                                    callbacks=callbacks,
                                    xgb_model=xgb_model,
                                    verbose_eval=False)
                bst.append(booster)
                evals_results.append(evals_result)

                # A repetition is complete every 'kfold' folds: report the metrics
                # of the folds belonging to that repetition only.
                if len(bst) % kfold == 0:
                    logging.info(
                        "The metrics of round {} cross validation".format(
                            len(bst) // kfold))
                    print_cv_metric(num_round, evals_results[-kfold:])

            if num_cv_round > 1:
                logging.info(
                    "The overall metrics of {}-round cross validation".format(
                        num_cv_round))
                print_cv_metric(num_round, evals_results)
    except Exception as e:
        error_message = str(e)
        # Failures whose message matches a known customer error are reported as
        # user errors; everything else is treated as an algorithm failure.
        if any(customer_error_message in error_message
               for customer_error_message in CUSTOMER_ERRORS):
            raise exc.UserError(error_message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix,
                                                   error_message)) from e

    # exist_ok avoids the window between an existence check and the creation.
    os.makedirs(model_dir, exist_ok=True)

    if not is_master:
        return

    if isinstance(bst, list):
        # Cross validation: one model file per trained fold.
        for fold, booster in enumerate(bst):
            model_location = os.path.join(model_dir, f"{MODEL_NAME}-{fold}")
            booster.save_model(model_location)
            logging.debug("Stored trained model {} at {}".format(
                fold, model_location))
    else:
        model_location = os.path.join(model_dir, MODEL_NAME)
        bst.save_model(model_location)
        logging.debug("Stored trained model at {}".format(model_location))
def train_job(train_cfg, train_dmatrix, val_dmatrix, model_dir, checkpoint_dir,
              is_master):
    """Run XGBoost training on this node's data and persist the resulting booster.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    The booster is pickled to '<model_dir>/xgboost-model' only when 'is_master' is True.

    'train_cfg' is modified in place: 'save_model_on_termination' and 'num_round' are
    removed, and 'eval_metric' is replaced by its cleaned value or removed.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory used to load and (when set) save checkpoints
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    """
    # Settings consumed by this function rather than forwarded to xgb.train()
    terminate_save_flag = train_cfg.pop('save_model_on_termination', "false")

    stop_rounds = train_cfg.get('early_stopping_rounds')
    total_rounds = train_cfg.pop("num_round")

    # Resolve the metrics / custom evaluation function for the train() API
    metric_setup = train_utils.get_eval_metrics_and_feval(
        train_cfg.get("_tuning_objective_metric"),
        train_cfg.get("eval_metric"))
    cleaned_eval_metric, configured_feval = metric_setup
    if not cleaned_eval_metric:
        train_cfg.pop('eval_metric', None)
    else:
        train_cfg['eval_metric'] = cleaned_eval_metric

    # Data sets evaluated after each boosting round
    evals = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        evals += [(val_dmatrix, 'validation')]

    # Resume from a checkpoint when one is available
    xgb_model, start_iteration = checkpointing.load_checkpoint(checkpoint_dir)
    remaining_rounds = total_rounds - start_iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", start_iteration)

    callbacks = [
        checkpointing.print_checkpointed_evaluation(
            start_iteration=start_iteration)
    ]
    if checkpoint_dir:
        callbacks.append(
            checkpointing.save_checkpoint(checkpoint_dir,
                                          start_iteration=start_iteration))

    if terminate_save_flag == "true":
        callbacks.append(
            checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks,
                  hyperparameters=train_cfg,
                  train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has {} rows".format(train_dmatrix.num_row()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(
            val_dmatrix.num_row()))

    try:
        bst = xgb.train(train_cfg,
                        train_dmatrix,
                        num_boost_round=remaining_rounds,
                        evals=evals,
                        feval=configured_feval,
                        early_stopping_rounds=stop_rounds,
                        callbacks=callbacks,
                        xgb_model=xgb_model,
                        verbose_eval=False)
    except Exception as e:
        failure_text = str(e)
        # A message matching a known customer error is surfaced as a user error.
        if any(known_message in failure_text
               for known_message in CUSTOMER_ERRORS):
            raise exc.UserError(failure_text)

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix,
                                                   failure_text))

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Only the master node writes the final model.
    if not is_master:
        return

    model_location = model_dir + '/xgboost-model'
    with open(model_location, 'wb') as model_file:
        pkl.dump(bst, model_file, protocol=4)
    logging.debug("Stored trained model at {}".format(model_location))
def _format_cv_eval_report(cv_eval_result):
    """Build the one-line summary of the last epoch of a cross validation run.

    :param cv_eval_result: Data frame of evaluation results returned by xgb.cv()
    :return: '[<last epoch>]' followed by tab separated '<metric>:<value>' fields, one per
             column whose name ends with '-mean'; a leading 'test-' is shown as 'validation-'
    """
    mean_suffix = "-mean"
    last_epoch = len(cv_eval_result.index) - 1
    fields = [f"[{last_epoch}]"]
    for column in cv_eval_result.columns:
        # Skip the standard deviation columns (select by name, not by position)
        if not column.endswith(mean_suffix):
            continue
        metric_name = column[:-len(mean_suffix)].replace(
            "test-", "validation-", 1)
        metric_val = cv_eval_result.at[last_epoch, column]
        fields.append('{0}:{1:.5f}'.format(metric_name, metric_val))
    return "\t".join(fields)


def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix,
              model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    When the '_nfold' hyperparameter is set and 'train_val_dmatrix' is provided, an n-fold
    cross validation is additionally run after training and its final metrics are printed.

    'train_cfg' is modified in place: 'save_model_on_termination', 'num_round' and '_nfold'
    are removed, and 'eval_metric' is replaced by its cleaned value or removed.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory used to load and (when set) save checkpoints
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if training fails with a message listed in CUSTOMER_ERRORS
    :raises exc.AlgorithmError: if training fails for any other reason
    """
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination',
                                              "false")

    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = train_cfg.pop("num_round")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))

    # Resume from a checkpoint when one is available: only the remaining rounds are run.
    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    callbacks = [
        checkpointing.print_checkpointed_evaluation(start_iteration=iteration)
    ]
    if checkpoint_dir:
        callbacks.append(
            checkpointing.save_checkpoint(checkpoint_dir,
                                          start_iteration=iteration))

    if save_model_on_termination == "true":
        callbacks.append(
            checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks,
                  hyperparameters=train_cfg,
                  train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has {} rows and {} columns".format(
        train_dmatrix.num_row(), train_dmatrix.num_col()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(
            val_dmatrix.num_row()))

    try:
        nfold = train_cfg.pop("_nfold", None)

        bst = xgb.train(train_cfg,
                        train_dmatrix,
                        num_boost_round=num_round,
                        evals=watchlist,
                        feval=configured_feval,
                        early_stopping_rounds=early_stopping_rounds,
                        callbacks=callbacks,
                        xgb_model=xgb_model,
                        verbose_eval=False)

        if nfold is not None and train_val_dmatrix is not None:
            logging.info(
                "Run {}-fold cross validation on the data of {} rows".format(
                    nfold, train_val_dmatrix.num_row()))
            # NOTE(review): num_round was already reduced by the checkpoint iteration
            # above, so cross validation also runs the reduced number of rounds when
            # resuming - confirm this is intended.
            # xgb.cv returns a pandas data frame of evaluation results.
            cv_eval_result = xgb.cv(
                train_cfg,
                train_val_dmatrix,
                nfold=nfold,
                num_boost_round=num_round,
                feval=configured_feval,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=True,
                show_stdv=True,
                shuffle=False)

            logging.info("The final metrics of cross validation")
            # Printed (not logged) in the same '[epoch]\tname:value' shape as before.
            print(_format_cv_eval_report(cv_eval_result))
    except Exception as e:
        error_message = str(e)
        # Failures whose message matches a known customer error are reported as
        # user errors; everything else is treated as an algorithm failure.
        if any(customer_error_message in error_message
               for customer_error_message in CUSTOMER_ERRORS):
            raise exc.UserError(error_message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix,
                                                   error_message)) from e

    # exist_ok avoids the window between an existence check and the creation.
    os.makedirs(model_dir, exist_ok=True)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as f:
            pkl.dump(bst, f, protocol=4)
        logging.debug("Stored trained model at {}".format(model_location))