def __init__(self, booster: Booster,
             tree_index: int,
             x_data,
             y_data,
             feature_names: List[str] = None,
             target_name: str = None,
             class_names: (List[str], Mapping[int, str]) = None):
    if hasattr(booster, 'get_booster'):
        booster = booster.get_booster()  # support XGBClassifier and XGBRegressor
    utils.check_tree_index(tree_index, len(booster.get_dump()))
    self.booster = booster
    self.tree_index = tree_index
    self.tree_to_dataframe = self._get_tree_dataframe()
    self.children_left = self._calculate_children(self.__class__.LEFT_CHILDREN_COLUMN)
    self.children_right = self._calculate_children(self.__class__.RIGHT_CHILDREN_COLUMN)
    self.config = json.loads(self.booster.save_config())
    self.node_to_samples = None  # lazy initialized
    self.features = None  # lazy initialized

    super().__init__(booster, x_data, y_data, feature_names, target_name, class_names)
def load_checkpoint(checkpoint_dir, max_try=5):
    """
    :param checkpoint_dir: e.g., /opt/ml/checkpoints
    :param max_try: number of times to try loading checkpoint before giving up.
    :return xgb_model: file path of stored xgb model. None if no checkpoint.
    :return iteration: iterations completed before the last checkpoint.
    """
    if not checkpoint_dir or not os.path.exists(checkpoint_dir):
        return None, 0

    regex = r"^{0}\.[0-9]+$".format(CHECKPOINT_FILENAME)
    checkpoints = [f for f in os.listdir(checkpoint_dir) if re.match(regex, f)]
    if not checkpoints:
        return None, 0
    checkpoints.sort()

    xgb_model, iteration = None, 0

    for _ in range(max_try):
        try:
            latest_checkpoint = checkpoints.pop()
            xgb_model = os.path.join(checkpoint_dir, latest_checkpoint)
            booster = Booster()
            booster.load_model(xgb_model)

            filename, extension = latest_checkpoint.split('.')
            iteration = int(extension) + 1
            break
        except XGBoostError:
            logging.debug("Wrong checkpoint model format %s", latest_checkpoint)

    return xgb_model, iteration
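# A minimal resume sketch for the loader above; `params`, `dtrain`, and
# `total_rounds` are placeholders of ours, not part of the original source.
import xgboost as xgb

def resume_training(params, dtrain, checkpoint_dir, total_rounds):
    xgb_model, completed = load_checkpoint(checkpoint_dir)
    # xgb.train accepts a saved model path via xgb_model and keeps boosting
    # from where the checkpoint left off.
    return xgb.train(params, dtrain,
                     num_boost_round=total_rounds - completed,
                     xgb_model=xgb_model)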
class BreastCancerTrainable(Trainable):
    def setup(self, config):
        self.config = config
        self.nthread = config.pop("nthread", 1)
        self.model: xgb.Booster = None
        # Load dataset
        data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
        # Split into train and test set
        train_x, test_x, train_y, test_y = train_test_split(
            data, labels, test_size=0.25
        )
        # Build input matrices for XGBoost
        self.train_set = xgb.DMatrix(train_x, label=train_y)
        self.test_set = xgb.DMatrix(test_x, label=test_y)

    def step(self):
        # You can also obtain the current trial resources:
        current_resources = self.trial_resources
        if isinstance(current_resources, PlacementGroupFactory):
            self.nthread = current_resources.head_cpus
        else:
            self.nthread = current_resources.cpu

        results = {}
        config = self.config.copy()
        config["nthread"] = int(self.nthread)
        self.model = xgb.train(
            config,
            self.train_set,
            evals=[(self.test_set, "eval")],
            verbose_eval=False,
            xgb_model=self.model,
            evals_result=results,
            num_boost_round=1,
        )
        print(config, results)
        return {
            "eval-logloss": results["eval"]["logloss"][-1],
            "nthread": self.nthread,
        }

    def save_checkpoint(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "wb") as outputFile:
            pickle.dump((self.config, self.nthread, self.model.save_raw()), outputFile)
        return path

    def load_checkpoint(self, checkpoint_path):
        with open(checkpoint_path, "rb") as inputFile:
            self.config, self.nthread, raw_model = pickle.load(inputFile)
        self.model = Booster()
        self.model.load_model(bytearray(raw_model))
        data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
        # Split into train and test set
        train_x, test_x, train_y, test_y = train_test_split(
            data, labels, test_size=0.25
        )
        # Build input matrices for XGBoost
        self.train_set = xgb.DMatrix(train_x, label=train_y)
        self.test_set = xgb.DMatrix(test_x, label=test_y)
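# A hedged launch sketch for the Trainable above, using Ray Tune's legacy
# tune.run API; the search-space values are illustrative only.
from ray import tune

analysis = tune.run(
    BreastCancerTrainable,
    config={
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": tune.loguniform(1e-4, 1e-1),
    },
    stop={"training_iteration": 10},
    num_samples=4,
)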
def deserialize_booster(ser_model_string):
    """
    Deserialize an xgboost.core.Booster from the input ser_model_string.
    """
    booster = Booster()
    # TODO: change to use string io
    tmp_file_name = os.path.join(_get_or_create_tmp_dir(), f"{uuid.uuid4()}.json")
    with open(tmp_file_name, "w", encoding="utf-8") as f:
        f.write(ser_model_string)
    booster.load_model(tmp_file_name)
    return booster
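# The inverse helper, sketched with the same temp-file approach the snippet
# assumes; the name serialize_booster and its body are ours, not the source's.
def serialize_booster(booster):
    """Serialize an xgboost.core.Booster into a JSON string."""
    tmp_file_name = os.path.join(_get_or_create_tmp_dir(), f"{uuid.uuid4()}.json")
    # save_model writes the JSON representation when the extension is .json
    booster.save_model(tmp_file_name)
    with open(tmp_file_name, "r", encoding="utf-8") as f:
        return f.read()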
class CVPack(object):
    """Auxiliary data structure to hold one fold of CV."""

    def __init__(self, dtrain, dtest, param):
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, iteration, fobj):
        """Update the boosters for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval):
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval)
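# A sketch of how the folds get driven, mirroring what xgboost.cv does
# internally; building `cvfolds` (one CVPack per fold) is elided here.
def run_cv(cvfolds, num_boost_round, obj=None, feval=None):
    for i in range(num_boost_round):
        for fold in cvfolds:
            fold.update(i, obj)  # one boosting round per fold
        # eval_set returns raw evaluation strings; aggregation is up to the caller
        yield [fold.eval(i, feval) for fold in cvfolds]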
def __setstate__(self, state):
    # Backward compatibility code.
    # Load the booster from raw data if that is what was pickled;
    # the Booster itself now supports pickling directly.
    bst = state["_Booster"]
    if bst is not None and not isinstance(bst, Booster):
        state["_Booster"] = Booster(model_file=bst)
    self.__dict__.update(state)
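# What the hook above enables, sketched: a legacy pickle whose _Booster slot
# held a raw model (from before Booster was pickleable) still loads cleanly.
# The file path here is hypothetical.
import pickle

with open("legacy_model.pkl", "rb") as f:
    clf = pickle.load(f)  # __setstate__ rebuilds a real Booster on the fly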
def worker(model, h, v, output_path, proc_que, out_q):
    params = {'nthread': 1}
    model = Booster(params=params, model_file=model)

    while True:
        chip_x, chip_y = proc_que.get()

        if chip_x == 'kill':
            log.debug('Received kill')
            out_q.put('killed')
            break

        outfile = os.path.join(
            output_path,
            'H{:02d}V{:02d}_{}_{}_class.p'.format(h, v, chip_x, chip_y))

        if os.path.exists(outfile):
            log.debug('Output exists, skipping')
            continue  # skip chips that have already been classified

        try:
            log.debug('Getting JSON data')
            jn = get_jsonchip(h, v, chip_x, chip_y).flatten()

            log.debug('Getting Aux data')
            aux = get_aux(chip_x, chip_y, exclude=['trends', 'nlcd2001', 'nlcd2011'])

            log.debug('Classifying')
            results = []
            for i in range(jn.shape[0]):
                ccd_models = unpack_result(jn[i])

                dat = np.hstack((
                    aux['dem'][i],
                    aux['aspect'][i],
                    aux['slope'][i],
                    aux['posidex'][i],
                    aux['mpw'][i],
                ))

                res = []
                for ccd_model in ccd_models:
                    res.extend(classifyccd(ccd_model, model, dat))
                results.append(res)

            log.debug('Saving file {}'.format(outfile))
            pickle.dump(results, open(outfile, 'wb'))
        except Exception as e:
            log.debug('EXCEPTION with {} {}'.format(chip_x, chip_y))
            log.exception(e)
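# A hedged driver sketch for the worker above, wiring up the queues with
# multiprocessing; tile indices, chip coordinates, and paths are placeholders.
from multiprocessing import Process, Queue

proc_que, out_q = Queue(), Queue()
p = Process(target=worker,
            args=('model.xgb', 3, 2, '/tmp/out', proc_que, out_q))
p.start()
proc_que.put((0, 0))          # one chip to classify
proc_que.put(('kill', None))  # poison pill ends the worker loop
out_q.get()                   # wait for the 'killed' acknowledgement
p.join()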
def _create_checkpoint(model: Booster, epoch: int, filename: str, frequency: int):
    if epoch % frequency > 0 or (not epoch and frequency > 1):
        # Skip 0th checkpoint if frequency > 1
        return
    with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
        model.save_model(os.path.join(checkpoint_dir, filename))
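# One way to call the helper every round, sketched with xgboost's public
# TrainingCallback API; the class name here is ours.
import xgboost as xgb

class CheckpointCallback(xgb.callback.TrainingCallback):
    def __init__(self, filename='model.xgb', frequency=5):
        self.filename = filename
        self.frequency = frequency

    def after_iteration(self, model, epoch, evals_log):
        _create_checkpoint(model, epoch, self.filename, self.frequency)
        return False  # False means "do not stop training"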
def _train_internal(params, dtrain,
                    num_boost_round=10, evals=(),
                    obj=None, feval=None,
                    xgb_model=None, callbacks=None):
    """Internal training function."""
    callbacks = [] if callbacks is None else callbacks
    evals = list(evals)

    # Expand a list-valued eval_metric into repeated key/value pairs, since
    # the backend expects one 'eval_metric' entry per metric.
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if xgb_model is not None:
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
        nboost = len(bst.get_dump())

    _params = dict(params) if isinstance(params, list) else params
    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    # Distributed code: load the checkpoint from rabit.
    version = bst.load_rabit_checkpoint()
    assert rabit.get_world_size() != 1 or version == 0

    rank = rabit.get_rank()
    start_iteration = int(version / 2)
    nboost += start_iteration

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]

    for i in range(nboost, num_boost_round):
        for cb in callbacks_before_iter:
            cb(CallbackEnv(model=bst,
                           cvfolds=None,
                           iteration=i,
                           begin_iteration=start_iteration,
                           end_iteration=num_boost_round,
                           rank=rank,
                           evaluation_result_list=None))
        # Distributed code: need to resume to this point.
        # Skip the first update if it is a recovery step.
        if version % 2 == 0:
            bst.update(dtrain, i, obj)
            bst.save_rabit_checkpoint()
            version += 1

        assert rabit.get_world_size() == 1 or version == rabit.version_number()

        nboost += 1
        evaluation_result_list = []
        # check evaluation result.
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()
            res = [x.split(':') for x in msg.split()]
            evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
        try:
            for cb in callbacks_after_iter:
                cb(CallbackEnv(model=bst,
                               cvfolds=None,
                               iteration=i,
                               begin_iteration=start_iteration,
                               end_iteration=num_boost_round,
                               rank=rank,
                               evaluation_result_list=evaluation_result_list))
        except EarlyStopException:
            break
        # do checkpoint after evaluation, in case evaluation also updates
        # the booster.
        bst.save_rabit_checkpoint()
        version += 1

    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
    else:
        bst.best_iteration = nboost - 1
    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
    return bst
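# Typical invocation, sketched on synthetic data; in the real library this
# legacy internal is wrapped by the public xgboost.train().
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)
bst = _train_internal({'objective': 'binary:logistic'}, dtrain,
                      num_boost_round=10)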
if _pandas:
    from xgboost.core import Booster, DMatrix
else:
    sys.modules['pandas'] = None
    from xgboost.core import Booster, DMatrix
    del sys.modules['pandas']

DEBUG = False
MODEL_NAME = 'v12.xgb'
PredictionRow = namedtuple('PredictionRow', ('c', 't', 's', 'p'))
handles = partial(register_handler, 'editor')
doc_generator = DocGenerator()

with resources.path('akimous.resources', MODEL_NAME) as _path:
    model = Booster(model_file=str(_path))  # 3 ms
    model.set_param('nthread', 1)
logger.info('Model %s loaded.', MODEL_NAME)


def get_relative_path(context):
    try:
        return tuple(
            context.path.relative_to(context.shared.project_root).parts)
    except ValueError:
        # the file does not belong to the project folder
        return tuple(context.path.parts)


async def run_pylint(context, send):
    if not config['linter']['pylint']: