Example No. 1
    def execute(cls, ctx, op):
        if op.merge:
            return super().execute(ctx, op)

        import pickle

        from xgboost import train, rabit

        dtrain = ToDMatrix.get_xgb_dmatrix(ctx[op.dtrain.key])
        evals = tuple()
        if op.evals is not None:
            eval_dmatrices = [ToDMatrix.get_xgb_dmatrix(ctx[t[0].key]) for t in op.evals]
            evals = tuple((m, ev[1]) for m, ev in zip(eval_dmatrices, op.evals))
        params = op.params
        params['nthread'] = ctx.get_ncores() or -1

        if op.tracker is None:
            # non distributed
            local_history = dict()
            kwargs = dict() if op.kwargs is None else op.kwargs
            bst = train(params, dtrain, evals=evals,
                        evals_result=local_history, **kwargs)
            ctx[op.outputs[0].key] = {'booster': pickle.dumps(bst), 'history': local_history}
        else:
            # distributed
            rabit_args = ctx[op.tracker.key]
            rabit.init(rabit_args)
            try:
                local_history = dict()
                kwargs = dict() if op.kwargs is None else op.kwargs
                bst = train(params, dtrain, evals=evals, evals_result=local_history,
                            **kwargs)
                ret = {'booster': pickle.dumps(bst), 'history': local_history}
                if rabit.get_rank() != 0:
                    ret = {}
                ctx[op.outputs[0].key] = ret
            finally:
                rabit.finalize()
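
A minimal non-distributed sketch of the same pattern, assuming numpy and xgboost are installed; the toy DMatrix below stands in for the chunk that ToDMatrix.get_xgb_dmatrix builds from ctx[op.dtrain.key], and the resulting dict has the same shape as the payload the op stores.

import pickle

import numpy as np
import xgboost as xgb

# Toy data standing in for the tensor chunk held in the execution context.
X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

history = dict()
bst = xgb.train({'objective': 'binary:logistic', 'nthread': 2},
                dtrain,
                num_boost_round=5,
                evals=[(dtrain, 'train')],
                evals_result=history)

# Same payload shape as the op stores: a pickled booster plus its eval history.
result = {'booster': pickle.dumps(bst), 'history': history}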
Example No. 2
    def __init__(self, is_master, current_host, master_port):
        """This is returned by the Rabit context manager for useful cluster information and data synchronization.

        :param is_master:
        :param current_host:
        :param master_port:
        """
        self.is_master = is_master
        self.rank = rabit.get_rank()
        self.current_host = current_host
        self.master_port = master_port
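
A hedged usage sketch, assuming an xgboost version that still ships xgboost.rabit; running rabit.init without a tracker puts rabit in single-process mode, so rabit.get_rank() inside __init__ returns 0. The host and port values are illustrative.

from xgboost import rabit

# Single-process initialization just to exercise the helper; in real use the
# Rabit context manager starts the tracker and calls rabit.init itself.
rabit.init()
helper = RabitHelper(is_master=True, current_host='localhost', master_port=9099)
try:
    print('rank {} on {}:{} (master: {})'.format(
        helper.rank, helper.current_host, helper.master_port, helper.is_master))
finally:
    rabit.finalize()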
Example No. 3
    def execute(cls, ctx, op):
        if op.merge:
            return super().execute(ctx, op)

        import pickle

        from xgboost import train, rabit

        dtrain = ToDMatrix.get_xgb_dmatrix(ensure_own_data(ctx[op.dtrain.key]))
        evals = tuple()
        if op.evals is not None:
            eval_dmatrices = [
                ToDMatrix.get_xgb_dmatrix(ensure_own_data(ctx[t[0].key]))
                for t in op.evals
            ]
            evals = tuple(
                (m, ev[1]) for m, ev in zip(eval_dmatrices, op.evals))
        params = op.params

        if op.tracker is None:
            # non distributed
            local_history = dict()
            kwargs = dict() if op.kwargs is None else op.kwargs
            bst = train(params,
                        dtrain,
                        evals=evals,
                        evals_result=local_history,
                        **kwargs)
            ctx[op.outputs[0].key] = {
                'booster': pickle.dumps(bst),
                'history': local_history
            }
        else:
            # distributed
            rabit_args = ctx[op.tracker.key]
            rabit.init([
                arg.tobytes() if isinstance(arg, memoryview) else arg
                for arg in rabit_args
            ])
            try:
                local_history = dict()
                kwargs = dict() if op.kwargs is None else op.kwargs
                bst = train(params,
                            dtrain,
                            evals=evals,
                            evals_result=local_history,
                            **kwargs)
                ret = {'booster': pickle.dumps(bst), 'history': local_history}
                if rabit.get_rank() != 0:
                    ret = {}
                ctx[op.outputs[0].key] = ret
            finally:
                rabit.finalize()
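
Compared with Example No. 1, the buffers are passed through ensure_own_data (presumably so the worker owns a private copy) and the tracker arguments may arrive as memoryview objects, which are converted to bytes before rabit.init. A standalone sketch of that normalization, using made-up tracker values:

# Illustrative tracker arguments; real values come from ctx[op.tracker.key].
rabit_args = [
    memoryview(b'DMLC_TRACKER_URI=10.0.0.1'),
    b'DMLC_TRACKER_PORT=9091',
    b'DMLC_NUM_WORKER=4',
]

# rabit.init expects plain str/bytes entries, so memoryview buffers are
# converted to bytes first.
normalized = [arg.tobytes() if isinstance(arg, memoryview) else arg
              for arg in rabit_args]
assert normalized[0] == b'DMLC_TRACKER_URI=10.0.0.1'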
Example No. 4
def _train_internal(params,
                    dtrain,
                    num_boost_round=10,
                    evals=(),
                    obj=None,
                    feval=None,
                    xgb_model=None,
                    callbacks=None):
    """internal training function"""
    callbacks = [] if callbacks is None else callbacks
    evals = list(evals)
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if xgb_model is not None:
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
        nboost = len(bst.get_dump())

    _params = dict(params) if isinstance(params, list) else params

    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    # Distributed code: Load the checkpoint from rabit.
    version = bst.load_rabit_checkpoint()
    assert (rabit.get_world_size() != 1 or version == 0)
    rank = rabit.get_rank()
    start_iteration = int(version / 2)
    nboost += start_iteration

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(nboost, num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=bst,
                            cvfolds=None,
                            iteration=i,
                            begin_iteration=start_iteration,
                            end_iteration=num_boost_round,
                            rank=rank,
                            evaluation_result_list=None))
        # Distributed code: need to resume to this point.
        # Skip the first update if it is a recovery step.
        if version % 2 == 0:
            bst.update(dtrain, i, obj)
            bst.save_rabit_checkpoint()
            version += 1

        assert (rabit.get_world_size() == 1
                or version == rabit.version_number())

        nboost += 1
        evaluation_result_list = []
        # check evaluation result.
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()
            res = [x.split(':') for x in msg.split()]
            evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=bst,
                                cvfolds=None,
                                iteration=i,
                                begin_iteration=start_iteration,
                                end_iteration=num_boost_round,
                                rank=rank,
                                evaluation_result_list=evaluation_result_list))
        except EarlyStopException:
            break
        # do checkpoint after evaluation, in case evaluation also updates booster.
        bst.save_rabit_checkpoint()
        version += 1

    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
    else:
        bst.best_iteration = nboost - 1
    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
    return bst
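
The checkpoint version returned by load_rabit_checkpoint encodes both the boosting round and whether the in-round update has already run: version // 2 is the round to resume from, and an odd version means the update was completed and is skipped on recovery. A small arithmetic sketch of that bookkeeping (plain Python, helper name is illustrative, not part of xgboost):

def decode_checkpoint_version(version):
    """Mirror the version bookkeeping above (illustrative helper only)."""
    start_iteration = version // 2            # same as int(version / 2) above
    update_already_done = (version % 2 == 1)
    return start_iteration, update_already_done

# Fresh run: version 0 -> start at round 0, the update step is still pending.
assert decode_checkpoint_version(0) == (0, False)
# Recovery mid-round: version 7 -> resume at round 3 and skip the update step.
assert decode_checkpoint_version(7) == (3, True)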
Example No. 5
def _train_internal(params, dtrain,
                    num_boost_round=10, evals=(),
                    obj=None, feval=None,
                    xgb_model=None, callbacks=None):
    """internal training function"""
    callbacks = [] if callbacks is None else callbacks
    evals = list(evals)
    if isinstance(params, dict) \
            and 'eval_metric' in params \
            and isinstance(params['eval_metric'], list):
        params = dict((k, v) for k, v in params.items())
        eval_metrics = params['eval_metric']
        params.pop("eval_metric", None)
        params = list(params.items())
        for eval_metric in eval_metrics:
            params += [('eval_metric', eval_metric)]

    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    nboost = 0
    num_parallel_tree = 1

    if xgb_model is not None:
        if not isinstance(xgb_model, STRING_TYPES):
            xgb_model = xgb_model.save_raw()
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
        nboost = len(bst.get_dump())

    _params = dict(params) if isinstance(params, list) else params

    if 'num_parallel_tree' in _params:
        num_parallel_tree = _params['num_parallel_tree']
        nboost //= num_parallel_tree
    if 'num_class' in _params:
        nboost //= _params['num_class']

    # Distributed code: Load the checkpoint from rabit.
    version = bst.load_rabit_checkpoint()
    assert (rabit.get_world_size() != 1 or version == 0)
    rank = rabit.get_rank()
    start_iteration = int(version / 2)
    nboost += start_iteration

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if
        not cb.__dict__.get('before_iteration', False)]

    for i in range(nboost, num_boost_round):
        for cb in callbacks_before_iter:
            cb(CallbackEnv(model=bst,
                           cvfolds=None,
                           iteration=i,
                           begin_iteration=start_iteration,
                           end_iteration=num_boost_round,
                           rank=rank,
                           evaluation_result_list=None))
        # Distributed code: need to resume to this point.
        # Skip the first update if it is a recovery step.
        if version % 2 == 0:
            bst.update(dtrain, i, obj)
            bst.save_rabit_checkpoint()
            version += 1

        assert (rabit.get_world_size() == 1
                or version == rabit.version_number())

        nboost += 1
        evaluation_result_list = []
        # check evaluation result.
        if len(evals) != 0:
            bst_eval_set = bst.eval_set(evals, i, feval)
            if isinstance(bst_eval_set, STRING_TYPES):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()
            res = [x.split(':') for x in msg.split()]
            evaluation_result_list = [(k, float(v)) for k, v in res[1:]]
        try:
            for cb in callbacks_after_iter:
                cb(CallbackEnv(model=bst,
                               cvfolds=None,
                               iteration=i,
                               begin_iteration=start_iteration,
                               end_iteration=num_boost_round,
                               rank=rank,
                               evaluation_result_list=evaluation_result_list))
        except EarlyStopException:
            break
        # do checkpoint after evaluation, in case evaluation also updates booster.
        bst.save_rabit_checkpoint()
        version += 1

    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
    else:
        bst.best_iteration = nboost - 1
    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
    return bst
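
After the boosting loop, any early-stopping attributes recorded on the booster are promoted to Python attributes, and best_ntree_limit scales the best iteration by num_parallel_tree. A small sketch of that final computation under assumed values:

# Assumed values: early stopping picked round 7 as best, with 4 parallel trees
# grown per boosting round (e.g. a boosted-random-forest configuration).
best_iteration = 7
num_parallel_tree = 4

# Mirrors the last lines above: number of trees to use when predicting.
best_ntree_limit = (best_iteration + 1) * num_parallel_tree
assert best_ntree_limit == 32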
Example No. 6
    def start(self):
        """Start the rabit process.

        If the current host is the master host, initialize and start the Rabit Tracker
        in the background. All hosts then connect to the master host to set up their
        Rabit rank.

        :return: the initialized RabitHelper, which includes helpful information
            such as is_master and port
        """
        self.rabit_context = None
        if self.is_master_host:
            self.logger.debug("Master host. Starting Rabit Tracker.")
            # The Rabit Tracker is a Python script that is responsible for
            # allowing each instance of rabit to find its peers and organize
            # itself into a ring for all-reduce. It supports primitive failure
            # recovery modes.
            #
            # It runs on a master node that each of the individual Rabit instances
            # talks to.
            self.rabit_context = tracker.RabitTracker(hostIP=self.current_host,
                                                      nslave=self.n_workers,
                                                      port=self.port,
                                                      port_end=self.port + 1)

            # Useful logging to ensure that the tracker has started.
            # These are the key-value config pairs that each of the rabit slaves
            # should be initialized with. Since we have deterministically allocated
            # the master host, its port, and the number of workers, we don't need
            # to pass these out-of-band to each slave; instead we rely on each
            # slave calculating the exact same config as the server.
            #
            # TODO: should probably check that these match up with what we pass below.
            self.logger.info("Rabit slave environment: {}".format(self.rabit_context.slave_envs()))

            # This actually starts the RabitTracker in a background/daemon thread
            # that will automatically exit when the main process has finished.
            self.rabit_context.start(self.n_workers)

        # Start each parameter server that connects to the master.
        self.logger.debug("Starting parameter server.")

        # Rabit runs as an in-process singleton library that can be configured once.
        # Calling init multiple times without calling finalize in between will cause a segfault.
        # We pass it the environment variables that match up with the RabitTracker
        # so that this instance can discover its peers (and recover from failure).
        #
        # First we check that the RabitTracker is up and running. Rabit actually
        # breaks (at least on Mac OS X) if the server is not running before it
        # begins to try to connect (its internal retries fail because they reuse
        # the same socket instead of creating a new one).
        #
        # if self.max_connect_attempts is None, this will loop indefinitely.
        attempt = 0
        successful_connection = False
        while (not successful_connection and
               (self.max_connect_attempts is None or attempt < self.max_connect_attempts)):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    self.logger.debug("Checking if RabitTracker is available.")
                    s.connect((self.master_host, self.port))
                    successful_connection = True
                    self.logger.debug("Successfully connected to RabitTracker.")
                except OSError:
                    self.logger.info("Failed to connect to RabitTracker on attempt {}".format(attempt))
                    attempt += 1
                    self.logger.info("Sleeping for {} sec before retrying".format(self.connect_retry_timeout))
                    time.sleep(self.connect_retry_timeout)

        if not successful_connection:
            self.logger.error("Failed to connect to Rabit Tracker after %s attempts", self.max_connect_attempts)
            raise Exception("Failed to connect to Rabit Tracker")
        else:
            self.logger.info("Connected to RabitTracker.")

        rabit.init(['DMLC_NUM_WORKER={}'.format(self.n_workers).encode(),
                    'DMLC_TRACKER_URI={}'.format(self.master_host).encode(),
                    'DMLC_TRACKER_PORT={}'.format(self.port).encode()])

        # We can check that the rabit instance has successfully connected to the
        # server by getting the rank of the server (e.g. its position in the ring).
        # This should be unique for each instance.
        self.logger.debug("Rabit started - Rank {}".format(rabit.get_rank()))
        self.logger.debug("Executing user code")

        # We can now run user-code. Since XGBoost runs in the same process space
        # it will use the same instance of rabit that we have configured. It has
        # a number of checks throughout the learning process to see if it is running
        # in distributed mode by calling rabit APIs. If it is it will do the
        # synchronization automatically.
        #
        # Hence we can now execute any XGBoost specific training code and it
        # will be distributed automatically.
        return RabitHelper(self.is_master_host, self.current_host, self.port)
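
A hedged sketch of driver code around start(): 'distributed_setup' is a placeholder for an already-constructed instance of the class that owns start() (its constructor is not shown here), and because rabit is an in-process singleton, whatever runs after start() must eventually call rabit.finalize() exactly once.

from xgboost import rabit

# Placeholder for an instance of the class defining start() above; hosts, ports
# and the number of workers come from its configuration.
helper = distributed_setup.start()
try:
    # Each worker should report a unique rank within the ring set up by the tracker.
    print('rank {} of {} (master: {})'.format(
        rabit.get_rank(), rabit.get_world_size(), helper.is_master))
    # ... run the XGBoost training code here; it picks up the initialized ring ...
finally:
    # Tear down the singleton exactly once when training is finished.
    rabit.finalize()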