Example #1
    def __init__(self,
                 booster: Booster,
                 tree_index: int,
                 x_data,
                 y_data,
                 feature_names: List[str] = None,
                 target_name: str = None,
                 class_names: Union[List[str], Mapping[int, str]] = None):
        if hasattr(booster, 'get_booster'):
            # support XGBClassifier and XGBRegressor
            booster = booster.get_booster()
        utils.check_tree_index(tree_index, len(booster.get_dump()))
        self.booster = booster
        self.tree_index = tree_index
        self.tree_to_dataframe = self._get_tree_dataframe()
        self.children_left = self._calculate_children(
            self.__class__.LEFT_CHILDREN_COLUMN)
        self.children_right = self._calculate_children(
            self.__class__.RIGHT_CHILDREN_COLUMN)
        self.config = json.loads(self.booster.save_config())
        self.node_to_samples = None  # lazy initialized
        self.features = None  # lazy initialized

        super().__init__(booster, x_data, y_data, feature_names, target_name,
                         class_names)
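
For orientation, `utils.check_tree_index` is used above only as a bounds guard on the tree index; a minimal sketch of what such a helper plausibly does (the body is an assumption, not the library's code):

# Hypothetical sketch of the bounds check performed by utils.check_tree_index.
def check_tree_index(tree_index, n_trees):
    if not 0 <= tree_index < n_trees:
        raise ValueError(
            f"tree_index must be in [0, {n_trees}), got {tree_index}")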
Example #2
def load_checkpoint(checkpoint_dir, max_try=5):
    """
    :param checkpoint_dir: e.g., /opt/ml/checkpoints
    :param max_try: number of times to try loading checkpoint before giving up.
    :return xgb_model: file path of stored xgb model. None if no checkpoint.
    :return iteration: iterations completed before the last checkpoint.
    """
    if not checkpoint_dir or not os.path.exists(checkpoint_dir):
        return None, 0

    regex = r"^{0}\.[0-9]+$".format(CHECKPOINT_FILENAME)
    checkpoints = [f for f in os.listdir(checkpoint_dir) if re.match(regex, f)]
    if not checkpoints:
        return None, 0
    # sort numerically by iteration suffix; a plain lexicographic sort would
    # misorder e.g. ".10" before ".9"
    checkpoints.sort(key=lambda f: int(f.rsplit('.', 1)[1]))

    xgb_model, iteration = None, 0

    for _ in range(max_try):
        if not checkpoints:
            break
        try:
            latest_checkpoint = checkpoints.pop()
            xgb_model = os.path.join(checkpoint_dir, latest_checkpoint)
            booster = Booster()
            booster.load_model(xgb_model)

            _, extension = latest_checkpoint.rsplit('.', 1)
            iteration = int(extension) + 1
            break
        except XGBoostError:
            logging.debug("Wrong checkpoint model format %s",
                          latest_checkpoint)

    return xgb_model, iteration
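
A hedged usage sketch for resuming training from the recovered checkpoint (`params`, `dtrain`, and `total_rounds` are assumed to exist in the caller):

import xgboost as xgb

# Hypothetical resume: train only the remaining rounds, warm-starting from
# the recovered model file (xgb_model=None simply trains from scratch).
xgb_model, iteration = load_checkpoint("/opt/ml/checkpoints")
booster = xgb.train(params, dtrain,
                    num_boost_round=total_rounds - iteration,
                    xgb_model=xgb_model)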
Example #3
class BreastCancerTrainable(Trainable):
    def setup(self, config):
        self.config = config
        self.nthread = config.pop("nthread", 1)
        self.model: xgb.Booster = None
        # Load dataset
        data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
        # Split into train and test set
        train_x, test_x, train_y, test_y = train_test_split(
            data, labels, test_size=0.25
        )
        # Build input matrices for XGBoost
        self.train_set = xgb.DMatrix(train_x, label=train_y)
        self.test_set = xgb.DMatrix(test_x, label=test_y)

    def step(self):
        # you can also obtain current trial resources:
        current_resources = self.trial_resources
        if isinstance(current_resources, PlacementGroupFactory):
            self.nthread = current_resources.head_cpus
        else:
            self.nthread = current_resources.cpu

        results = {}
        config = self.config.copy()
        config["nthread"] = int(self.nthread)
        self.model = xgb.train(
            config,
            self.train_set,
            evals=[(self.test_set, "eval")],
            verbose_eval=False,
            xgb_model=self.model,
            evals_result=results,
            num_boost_round=1,
        )
        print(config, results)
        return {"eval-logloss": results["eval"]["logloss"][-1], "nthread": self.nthread}

    def save_checkpoint(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "wb") as outputFile:
            pickle.dump((self.config, self.nthread, self.model.save_raw()), outputFile)
        return path

    def load_checkpoint(self, checkpoint_path):
        with open(checkpoint_path, "rb") as inputFile:
            self.config, self.nthread, raw_model = pickle.load(inputFile)
        self.model = xgb.Booster()
        self.model.load_model(bytearray(raw_model))
        data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
        # Split into train and test set
        train_x, test_x, train_y, test_y = train_test_split(
            data, labels, test_size=0.25
        )
        # Build input matrices for XGBoost
        self.train_set = xgb.DMatrix(train_x, label=train_y)
        self.test_set = xgb.DMatrix(test_x, label=test_y)
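
A minimal launch sketch for this Trainable, assuming the legacy `tune.run` API; the search space and stopping criterion are illustrative, not taken from the source:

from ray import tune

# Hypothetical driver: sweep over eta; each trial is one
# BreastCancerTrainable, boosted one round per step().
analysis = tune.run(
    BreastCancerTrainable,
    config={
        "objective": "binary:logistic",
        "eval_metric": ["logloss"],
        "eta": tune.loguniform(1e-3, 0.3),
    },
    stop={"training_iteration": 10},
    checkpoint_freq=5,
)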
Example #4
def deserialize_booster(ser_model_string):
    """
    Deserialize an xgboost.core.Booster from the input ser_model_string.
    """
    booster = Booster()
    # TODO: change to use string io
    tmp_file_name = os.path.join(_get_or_create_tmp_dir(),
                                 f"{uuid.uuid4()}.json")
    with open(tmp_file_name, "w", encoding="utf-8") as f:
        f.write(ser_model_string)
    booster.load_model(tmp_file_name)
    return booster
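
The matching serializer is not shown here; a plausible counterpart sketch using the same temp-file approach (assumed, not the actual implementation):

def serialize_booster(booster):
    """Hypothetical inverse: dump a Booster to a JSON model string."""
    tmp_file_name = os.path.join(_get_or_create_tmp_dir(),
                                 f"{uuid.uuid4()}.json")
    booster.save_model(tmp_file_name)
    with open(tmp_file_name, "r", encoding="utf-8") as f:
        return f.read()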
Example #6
class CVPack(object):
    """Auxiliary data structure to hold one fold of CV."""
    def __init__(self, dtrain, dtest, param):
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, iteration, fobj):
        """Update the booster for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval):
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval)
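
For context, a rough sketch of how a cross-validation loop drives a list of such CVPacks (simplified; `folds`, `obj`, and `feval` are assumed, and aggregation of the per-fold eval strings is omitted):

# Simplified driver loop (sketch): update every fold, then evaluate it.
cvfolds = [CVPack(dtrain_k, dtest_k, params)
           for dtrain_k, dtest_k in folds]
for i in range(num_boost_round):
    for fold in cvfolds:
        fold.update(i, obj)
    fold_results = [fold.eval(i, feval) for fold in cvfolds]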
Example #7
def __setstate__(self, state):
    # backward compatibility code
    # load the booster from raw bytes if it is raw;
    # the booster now supports pickle directly
    bst = state["_Booster"]
    if bst is not None and not isinstance(bst, Booster):
        state["_Booster"] = Booster(model_file=bst)
    self.__dict__.update(state)
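
In practice this shim lets both old- and new-format pickles rehydrate; a hedged illustration (`old_pickle_bytes`, holding a pre-Booster-pickle payload, is assumed to exist):

import pickle

# Hypothetical round-trip: unpickling routes through __setstate__, which
# upgrades a raw "_Booster" payload into a live Booster object.
restored = pickle.loads(old_pickle_bytes)
assert isinstance(restored.__dict__["_Booster"], Booster)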
Example #8
def worker(model, h, v, output_path, proc_que, out_q):
    params = {'nthread': 1}
    model = Booster(params=params, model_file=model)

    while True:
        chip_x, chip_y = proc_que.get()

        if chip_x == 'kill':
            log.debug('Received kill')
            out_q.put('killed')
            break

        outfile = os.path.join(
            output_path,
            'H{:02d}V{:02d}_{}_{}_class.p'.format(h, v, chip_x, chip_y))

        if os.path.exists(outfile):
            log.debug('Output exists, skipping')
            continue

        try:
            log.debug('Getting JSON data')
            jn = get_jsonchip(h, v, chip_x, chip_y).flatten()

            log.debug('Getting Aux data')
            aux = get_aux(chip_x,
                          chip_y,
                          exclude=['trends', 'nlcd2001', 'nlcd2011'])

            log.debug('Classifying')
            results = []
            for i in range(jn.shape[0]):
                ccd_models = unpack_result(jn[i])

                dat = np.hstack((
                    aux['dem'][i],
                    aux['aspect'][i],
                    aux['slope'][i],
                    aux['posidex'][i],
                    aux['mpw'][i],
                ))

                res = []
                for ccd_model in ccd_models:
                    res.extend(classifyccd(ccd_model, model, dat))

                results.append(res)
            log.debug('Saving file {}'.format(outfile))
            with open(outfile, 'wb') as f:
                pickle.dump(results, f)
        except Exception as e:
            log.debug('EXCEPTION with {} {}'.format(chip_x, chip_y))
            log.exception(e)
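
A hedged driver sketch for this worker; the queue wiring and names like `chips`, `n_workers`, `model_path`, `h`, and `v` are assumptions:

from multiprocessing import Process, Queue

# Hypothetical orchestration: enqueue work, start workers, send one 'kill'
# sentinel per worker, then wait for each worker's acknowledgement.
proc_que, out_q = Queue(), Queue()
for chip_x, chip_y in chips:
    proc_que.put((chip_x, chip_y))
workers = [Process(target=worker,
                   args=(model_path, h, v, output_path, proc_que, out_q))
           for _ in range(n_workers)]
for w in workers:
    w.start()
for _ in workers:
    proc_que.put(('kill', None))
for _ in workers:
    out_q.get()  # each worker reports 'killed' before exiting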
Example #9
def _create_checkpoint(model: Booster, epoch: int, filename: str, frequency: int):
    if epoch % frequency > 0 or (not epoch and frequency > 1):
        # Skip the 0th checkpoint if frequency > 1
        return
    with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
        model.save_model(os.path.join(checkpoint_dir, filename))
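
A sketch of hooking this helper into training through xgboost's callback API; the wrapper class and its wiring are assumptions, not the library's integration:

import xgboost as xgb

# Hypothetical wrapper: invoke the helper once per boosting round.
class TuneCheckpointCallback(xgb.callback.TrainingCallback):
    def __init__(self, filename="checkpoint", frequency=5):
        super().__init__()
        self._filename = filename
        self._frequency = frequency

    def after_iteration(self, model, epoch, evals_log):
        _create_checkpoint(model, epoch, self._filename, self._frequency)
        return False  # never request early stopping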
Example #10
def _train_internal(params,
                    dtrain,
                    num_boost_round=10,
                    evals=(),
                    obj=None,
                    feval=None,
                    xgb_model=None,
                    callbacks=None):
    """internal training function"""
    callbacks = [] if callbacks is None else callbacks
    evals = list(evals)
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if xgb_model is not None:
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
        nboost = len(bst.get_dump())

    _params = dict(params) if isinstance(params, list) else params

    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    # Distributed code: Load the checkpoint from rabit.
    version = bst.load_rabit_checkpoint()
    assert (rabit.get_world_size() != 1 or version == 0)
    rank = rabit.get_rank()
    start_iteration = int(version / 2)
    nboost += start_iteration

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(nboost, num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=bst,
                            cvfolds=None,
                            iteration=i,
                            begin_iteration=start_iteration,
                            end_iteration=num_boost_round,
                            rank=rank,
                            evaluation_result_list=None))
        # Distributed code: need to resume to this point.
        # Skip the first update if it is a recovery step.
        if version % 2 == 0:
            bst.update(dtrain, i, obj)
            bst.save_rabit_checkpoint()
            version += 1

        assert (rabit.get_world_size() == 1
                or version == rabit.version_number())

        nboost += 1
        evaluation_result_list = []
        # check evaluation result.
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()
            res = [x.split(':') for x in msg.split()]
            evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=bst,
                                cvfolds=None,
                                iteration=i,
                                begin_iteration=start_iteration,
                                end_iteration=num_boost_round,
                                rank=rank,
                                evaluation_result_list=evaluation_result_list))
        except EarlyStopException:
            break
        # do checkpoint after evaluation, in case evaluation also updates booster.
        bst.save_rabit_checkpoint()
        version += 1

    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
    else:
        bst.best_iteration = nboost - 1
    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
    return bst
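
For orientation, the public `xgb.train` of the same era was essentially a thin wrapper over this internal function; a simplified sketch (argument handling such as early stopping and verbosity is omitted):

def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          xgb_model=None, callbacks=None):
    """Simplified sketch of the public entry point delegating to the
    internal trainer above."""
    return _train_internal(params, dtrain,
                           num_boost_round=num_boost_round, evals=evals,
                           obj=obj, feval=feval, xgb_model=xgb_model,
                           callbacks=callbacks)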
Example #13
if _pandas:
    from xgboost.core import Booster, DMatrix
else:
    # temporarily block pandas so xgboost.core imports via its non-pandas path
    sys.modules['pandas'] = None
    from xgboost.core import Booster, DMatrix
    del sys.modules['pandas']

DEBUG = False
MODEL_NAME = 'v12.xgb'
PredictionRow = namedtuple('PredictionRow', ('c', 't', 's', 'p'))

handles = partial(register_handler, 'editor')
doc_generator = DocGenerator()

with resources.path('akimous.resources', MODEL_NAME) as _path:
    model = Booster(model_file=str(_path))  # 3 ms
    model.set_param('nthread', 1)
logger.info('Model %s loaded.', MODEL_NAME)
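
A hedged scoring sketch for the loaded model; only the call shape is illustrated, and the zeroed feature row stands in for a real (unknown) feature layout:

import numpy as np

# Hypothetical single-row scoring call against the loaded ranking model.
features = np.zeros((1, model.num_features()))  # real feature layout assumed
scores = model.predict(DMatrix(features))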


def get_relative_path(context):
    try:
        return tuple(
            context.path.relative_to(context.shared.project_root).parts)
    except ValueError:
        # the file does not belong to the project folder
        return tuple(context.path.parts)


async def run_pylint(context, send):
    if not config['linter']['pylint']: