# Imports required by this section; the package-relative layout below is an
# assumption based on the surrounding xgboost module (core defines Booster,
# CallbackEnv, EarlyStopException and STRING_TYPES).
from __future__ import absolute_import

from .core import Booster, CallbackEnv, EarlyStopException, STRING_TYPES
from . import rabit


class CVPack(object):
    """Auxiliary data structure to hold one fold of CV."""

    def __init__(self, dtrain, dtest, param):
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, iteration, fobj):
        """Update the boosters for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval):
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval)
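
# Hedged usage sketch (assumed demo helper, not part of xgboost): drives one
# CVPack fold by hand on random data. The fold split, parameter values and
# iteration count are illustrative assumptions.
def _demo_cvpack():
    import numpy as np
    from .core import DMatrix  # DMatrix lives alongside Booster in core
    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 5), rng.randint(2, size=100)
    # First 80 rows act as the training split, the rest as the held-out fold.
    fold = CVPack(DMatrix(X[:80], label=y[:80]),
                  DMatrix(X[80:], label=y[80:]),
                  {'max_depth': 2, 'objective': 'binary:logistic'})
    for it in range(5):
        fold.update(it, fobj=None)        # one boosting round on this fold
        print(fold.eval(it, feval=None))  # train/test evaluation string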
def _train_internal(params, dtrain,
                    num_boost_round=10, evals=(),
                    obj=None, feval=None,
                    xgb_model=None, callbacks=None):
    """internal training function"""
    callbacks = [] if callbacks is None else callbacks
    evals = list(evals)
    # A list-valued 'eval_metric' cannot be passed through as a single dict
    # entry, so expand it into repeated ('eval_metric', value) tuples.
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if xgb_model is not None:
        # Continue training from an existing model (file path or Booster).
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
        nboost = len(bst.get_dump())

    _params = dict(params) if isinstance(params, list) else params

    # get_dump() counts individual trees; convert that to boosting rounds.
    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    # Distributed code: Load the checkpoint from rabit.
    version = bst.load_rabit_checkpoint()
    assert rabit.get_world_size() != 1 or version == 0
    rank = rabit.get_rank()
    # Each round bumps the version twice (update + eval checkpoint), so the
    # round to resume from is half of the checkpoint version.
    start_iteration = int(version / 2)
    nboost += start_iteration

    callbacks_before_iter = [
        cb for cb in callbacks
        if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)]

    for i in range(start_iteration, num_boost_round):
        for cb in callbacks_before_iter:
            cb(CallbackEnv(model=bst,
                           cvfolds=None,
                           iteration=i,
                           begin_iteration=start_iteration,
                           end_iteration=num_boost_round,
                           rank=rank,
                           evaluation_result_list=None))
        # Distributed code: need to resume to this point.
        # Skip the first update if it is a recovery step.
        if version % 2 == 0:
            bst.update(dtrain, i, obj)
            bst.save_rabit_checkpoint()
            version += 1

        assert rabit.get_world_size() == 1 or version == rabit.version_number()

        nboost += 1
        evaluation_result_list = []
        # check evaluation result.
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()
            # msg has the form "[i]\tname-metric:value\t..."; drop the
            # leading "[i]" round marker and keep the (name, value) pairs.
            res = [x.split(':') for x in msg.split()]
            evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
        try:
            for cb in callbacks_after_iter:
                cb(CallbackEnv(model=bst,
                               cvfolds=None,
                               iteration=i,
                               begin_iteration=start_iteration,
                               end_iteration=num_boost_round,
                               rank=rank,
                               evaluation_result_list=evaluation_result_list))
        except EarlyStopException:
            break
        # do checkpoint after evaluation, in case evaluation also updates
        # the booster.
        bst.save_rabit_checkpoint()
        version += 1

    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
    else:
        bst.best_iteration = nboost - 1
    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
    return bst
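
# Hedged usage sketch (assumed demo helper, not part of xgboost): calls
# _train_internal directly on random data. In normal use it is reached via
# the public train() wrapper; the list-valued 'eval_metric' here exercises
# the parameter-expansion branch at the top of the function. All values are
# illustrative assumptions.
def _demo_train_internal():
    import numpy as np
    from .core import DMatrix
    rng = np.random.RandomState(0)
    X, y = rng.rand(200, 5), rng.randint(2, size=200)
    dtrain = DMatrix(X, label=y)
    bst = _train_internal({'objective': 'binary:logistic', 'eta': 0.3,
                           'eval_metric': ['error', 'logloss']},
                          dtrain,
                          num_boost_round=10,
                          evals=[(dtrain, 'train')])
    # best_ntree_limit reflects rounds * num_parallel_tree (1 here).
    return bst.best_ntree_limit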
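
# Hedged illustration (assumed demo helper, not part of xgboost): the
# evaluation-string parsing performed inside the training loop above, run on
# a hand-written message in the Booster.eval_set format; the metric values
# are made up.
def _demo_parse_eval_message():
    msg = '[3]\ttrain-logloss:0.482\ttest-logloss:0.517'
    res = [x.split(':') for x in msg.split()]
    # res[0] is the '[3]' round marker; the remaining tokens are
    # name:value pairs.
    return [(k, float(v)) for k, v in res[1:]]
    # -> [('train-logloss', 0.482), ('test-logloss', 0.517)]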