Example #1
    def write_checkpoint(self, checkpoint: Dict):
        # If inside a Tune Trainable, then checkpoint with Tune.
        with tune.checkpoint_dir(
                step=self._latest_checkpoint_id) as checkpoint_dir:
            source_ip = checkpoint[NODE_IP_KEY]
            source_path = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
            target_ip = get_node_ip_address()
            if source_ip == target_ip:
                # Move contents of source_path, but not source_path
                # itself. shutil.move is already recursive.
                for path in Path(source_path).iterdir():
                    shutil.move(str(path.absolute()), checkpoint_dir)
                shutil.rmtree(source_path, ignore_errors=True)
            else:
                sync_dir_between_nodes(
                    source_ip=source_ip,
                    source_path=source_path,
                    target_ip=target_ip,
                    target_path=checkpoint_dir,
                    return_futures=False,
                    max_size_bytes=None,
                )
                delete_on_node(node_ip=source_ip, path=source_path)
            checkpoint_dir = Path(checkpoint_dir)
            save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)
            # add tune checkpoint id
            with open(checkpoint_dir.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
                f.write(str(self._latest_checkpoint_id))
    def commit(self, path: Optional[Path] = None) -> None:
        if (self.storage_mode == CheckpointStorage.MEMORY or not path
                or not isinstance(self.dir_or_data, dict)):
            return

        source_ip = self.dir_or_data[NODE_IP_KEY]
        source_path = self.dir_or_data[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()

        if source_ip == target_ip:
            # Move contents of source_path, but not source_path
            # itself. shutil.move is already recursive.
            for inner in Path(source_path).iterdir():
                shutil.move(str(inner.absolute()), str(path))
            shutil.rmtree(source_path, ignore_errors=True)
        else:
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=str(path),
                return_futures=False,
                max_size_bytes=None,
            )
            delete_on_node(node_ip=source_ip, path=source_path)
        save_preprocessor_to_dir(self.dir_or_data.pop(PREPROCESSOR_KEY, None),
                                 path)
        # add tune checkpoint id
        with open(path.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
            f.write(str(self.id))
    def _convert_directory_checkpoint_to_sync_if_needed(
            self, checkpoint: Checkpoint) -> Checkpoint:
        """Replace the directory checkpoint with a node ip & path dict checkpoint.

        This dict checkpoint will be used to sync the directory.
        If we were to use a directory checkpoint directly, it would get deepcopied &
        serialized unnecessarily."""
        with checkpoint.as_directory() as checkpoint_path:
            # Load checkpoint from path.
            checkpoint_path = Path(checkpoint_path).expanduser().absolute()
            if not checkpoint_path.joinpath(TUNE_CHECKPOINT_ID).exists():
                # If the ID file is missing, we assume that this is already
                # a sync checkpoint
                dict_checkpoint = checkpoint.to_dict()
                if (NODE_IP_KEY not in dict_checkpoint
                        or CHECKPOINT_PATH_ON_NODE_KEY not in dict_checkpoint):
                    raise ValueError(
                        "Wrong checkpoint format. Ensure the checkpoint is a "
                        "result of `HuggingFaceTrainer`.")
                return checkpoint
            with open(checkpoint_path.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
                tune_checkpoint_id = int(f.read())

            return Checkpoint.from_dict({
                NODE_IP_KEY:
                get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY:
                str(checkpoint_path),
                TUNE_CHECKPOINT_ID:
                tune_checkpoint_id,
            })
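The methods above hand checkpoints between nodes as plain dict checkpoints that record only the source node's IP and an on-node path. A minimal sketch of what such a dict checkpoint looks like, assuming Ray AIR's public `Checkpoint` API and the `ray.util.get_node_ip_address` helper; the literal key names stand in for the `NODE_IP_KEY` / `CHECKPOINT_PATH_ON_NODE_KEY` constants and are assumed here for illustration only:

from ray.air.checkpoint import Checkpoint
from ray.util import get_node_ip_address

# Illustrative key names and path; the real code uses NODE_IP_KEY and
# CHECKPOINT_PATH_ON_NODE_KEY constants from Ray Train.
sync_checkpoint = Checkpoint.from_dict({
    "node_ip": get_node_ip_address(),
    "checkpoint_path_on_node": "/tmp/checkpoint_000001",
})

# A consumer compares the stored IP against its own node IP to decide
# between a local move and a cross-node sync, as write_checkpoint() and
# commit() above do.
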
Example #4
def deploy_ray_func(func, *args):  # pragma: no cover
    """
    Execute a function on an axis partition in a worker process.

    Parameters
    ----------
    func : callable
        Function to be executed on an axis partition.
    *args : iterable
        Additional arguments that need to be passed to ``func``.

    Returns
    -------
    list
        The result of the function ``func`` and metadata for it.

    Notes
    -----
    Ray functions are not detected by codecov (thus pragma: no cover).
    """
    result = func(*args)
    ip = get_node_ip_address()
    if isinstance(result, pandas.DataFrame):
        return result, len(result), len(result.columns), ip
    elif all(isinstance(r, pandas.DataFrame) for r in result):
        return [i for r in result for i in [r, len(r), len(r.columns), ip]]
    else:
        return [i for r in result for i in [r, None, None, ip]]
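For context, a hedged sketch (not Modin source) of how a `deploy_ray_func`-style helper is typically driven: it runs as a Ray task, and its return value carries the resulting partition together with its length, width, and the worker's node IP. The sketch assumes `ray.util.get_node_ip_address` is the `get_node_ip_address` imported in these snippets.

import pandas
import ray
from ray.util import get_node_ip_address

ray.init(ignore_reinit_error=True)

@ray.remote
def run_on_partition(func, df):
    # Same shape of return value as deploy_ray_func: result plus metadata.
    result = func(df)
    return result, len(result), len(result.columns), get_node_ip_address()

df = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result, n_rows, n_cols, ip = ray.get(run_on_partition.remote(lambda d: d * 2, df))
print(n_rows, n_cols, ip)  # 3 2 <worker node IP>
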
Example #5
    def _process_events(self, timeout: Optional[float] = None):
        with warn_if_slow("get_next_failed_trial"):
            failed_trial = self.trial_executor.get_next_failed_trial()
        if failed_trial:
            error_msg = (
                "{} (IP: {}) detected as stale. This is likely because the "
                "node was lost").format(failed_trial, failed_trial.node_ip)
            logger.info(error_msg)
            with warn_if_slow("process_failed_trial"):
                self._process_trial_failure(failed_trial, error_msg=error_msg)
        else:
            # TODO(ujvl): Consider combining get_next_available_trial and
            #  fetch_result functionality so that we don't timeout on fetch.
            trial = self.trial_executor.get_next_available_trial(
                timeout=timeout)  # blocking
            if not trial:
                return
            if trial.is_restoring:
                with warn_if_slow("process_trial_restore"):
                    self._process_trial_restore(trial)
                with warn_if_slow("callbacks.on_trial_restore"):
                    self._callbacks.on_trial_restore(iteration=self._iteration,
                                                     trials=self._trials,
                                                     trial=trial)
            elif trial.is_saving:
                with warn_if_slow("process_trial_save") as _profile:
                    self._process_trial_save(trial)
                with warn_if_slow("callbacks.on_trial_save"):
                    self._callbacks.on_trial_save(iteration=self._iteration,
                                                  trials=self._trials,
                                                  trial=trial)
                if _profile.too_slow and trial.sync_on_checkpoint:
                    # TODO(ujvl): Suggest using DurableTrainable once
                    #  API has converged.

                    msg = (
                        "Consider turning off forced head-worker trial "
                        "checkpoint syncs by setting sync_on_checkpoint=False"
                        ". Note that this may result in faulty trial "
                        "restoration if a failure occurs while the checkpoint "
                        "is being synced from the worker to the head node.")

                    if trial.location.hostname and (trial.location.hostname !=
                                                    get_node_ip_address()):
                        if log_once("tune_head_worker_checkpoint"):
                            logger.warning(msg)

            else:
                with warn_if_slow("process_trial"):
                    self._process_trial(trial)

            # `self._queued_trial_decisions` now contains a final decision
            # based on all results
            if trial not in self._cached_trial_decisions:
                final_decision = self._queued_trial_decisions.pop(
                    trial.trial_id, None)
                if final_decision:
                    self._execute_action(trial, final_decision)
Example #6
def apply_list_of_funcs(funcs, partition):  # pragma: no cover
    """
    Execute all operations stored in the call queue on the partition in a worker process.

    Parameters
    ----------
    funcs : list
        A call queue that needs to be executed on the partition.
    partition : pandas.DataFrame
        A pandas DataFrame the call queue needs to be executed on.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    int
        The number of rows of the resulting pandas DataFrame.
    int
        The number of columns of the resulting pandas DataFrame.
    str
        The node IP address of the worker process.
    """

    def deserialize(obj):
        if isinstance(obj, ObjectIDType):
            return ray.get(obj)
        elif isinstance(obj, (tuple, list)) and any(
            isinstance(o, ObjectIDType) for o in obj
        ):
            return ray.get(list(obj))
        elif isinstance(obj, dict) and any(
            isinstance(val, ObjectIDType) for val in obj.values()
        ):
            return dict(zip(obj.keys(), ray.get(list(obj.values()))))
        else:
            return obj

    for func, args, kwargs in funcs:
        func = deserialize(func)
        args = deserialize(args)
        kwargs = deserialize(kwargs)
        try:
            partition = func(partition, *args, **kwargs)
        # Sometimes Arrow forces us to make a copy of an object before we operate on it. We
        # don't want the error to propagate to the user, and we want to avoid copying unless
        # we absolutely have to.
        except ValueError:
            partition = func(partition.copy(), *args, **kwargs)

    return (
        partition,
        len(partition) if hasattr(partition, "__len__") else 0,
        len(partition.columns) if hasattr(partition, "columns") else 0,
        get_node_ip_address(),
    )
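A brief illustration of the resolution pattern used by the `deserialize` helper above: when object references arrive nested inside a list or inside dict values, a single `ray.get` over a list of references fetches them all in one round trip (the names below are illustrative).

import ray

ray.init(ignore_reinit_error=True)

args = [ray.put(20), ray.put(22)]
kwargs = {"x": ray.put(1), "y": ray.put(2)}

resolved_args = ray.get(args)  # one call resolves the whole list -> [20, 22]
resolved_kwargs = dict(zip(kwargs.keys(), ray.get(list(kwargs.values()))))
print(sum(resolved_args), resolved_kwargs)  # 42 {'x': 1, 'y': 2}
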
Example #7
    def on_save(self, args, state, control, **kwargs):
        # Save is called after evaluation.
        checkpoint_path = Path(
            transformers.trainer.get_last_checkpoint(
                args.output_dir)).absolute()
        if checkpoint_path:
            train.save_checkpoint(
                **{
                    NODE_IP_KEY: get_node_ip_address(),
                    CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
                })
Example #8
    def __init__(self,
                 local_dir: str,
                 remote_dir: str,
                 sync_client: Optional[SyncClient] = None):
        configure_logging(log_style="record",
                          verbosity=env_integer("TUNE_SYNCER_VERBOSITY", 0))
        self.local_ip = get_node_ip_address()
        self.worker_ip = None

        sync_client = sync_client or DockerSyncClient()
        sync_client.configure(self._cluster_config_file)

        super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
Example #9
    def __init__(self,
                 local_dir: str,
                 remote_dir: str,
                 sync_client: Optional[SyncClient] = None):
        if not kubernetes:
            raise ImportError(
                "kubernetes is not installed on this machine/container. "
                "Try: pip install kubernetes")
        self.local_ip = get_node_ip_address()
        self.local_node = self._get_kubernetes_node_by_ip(self.local_ip)
        self.worker_ip = None
        self.worker_node = None

        sync_client = sync_client or KubernetesSyncClient(
            namespace=self.__class__._namespace)

        super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
Example #10
    def after_iteration(self, model, epoch, evals_log):
        if self._allow_ips and get_node_ip_address() not in self._allow_ips:
            return

        if epoch == self._iteration:
            rank = get_actor_rank()
            if rank in self._ranks:
                if not ray.get(self._state.has_failed.remote(self._id)):
                    success = ray.get(self._state.set_failed.remote(self._id))
                    if not success:
                        # Another rank is already about to fail
                        return

                    pid = os.getpid()
                    print(f"Killing process: {pid} for actor rank {rank}")
                    time.sleep(1)
                    os.kill(pid, 9)
Example #11
def apply_func(partition, func, *args, **kwargs):  # pragma: no cover
    """
    Execute a function on the partition in a worker process.

    Parameters
    ----------
    partition : pandas.DataFrame
        A pandas DataFrame the function needs to be executed on.
    func : callable
        Function that needs to be executed on the partition.
    *args : iterable
        Additional positional arguments to be passed to `func`.
    **kwargs : dict
        Additional keyword arguments to be passed to `func`.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    int
        The number of rows of the resulting pandas DataFrame.
    int
        The number of columns of the resulting pandas DataFrame.
    str
        The node IP address of the worker process.
    """
    try:
        result = func(partition, *args, **kwargs)
    # Sometimes Arrow forces us to make a copy of an object before we operate on it. We
    # don't want the error to propagate to the user, and we want to avoid copying unless
    # we absolutely have to.
    except ValueError:
        result = func(partition.copy(), *args, **kwargs)
    return (
        result,
        len(result) if hasattr(result, "__len__") else 0,
        len(result.columns) if hasattr(result, "columns") else 0,
        get_node_ip_address(),
    )
Example #12
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local enviroment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added: aside from duplicating
    # the functionality of our callbacks, the Wandb callback causes
    # training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )
    trainer.train(resume_from_checkpoint=checkpoint_path)
    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
Example #13
    def before_training(self, model):
        ip_address = get_node_ip_address()
        put_queue(ip_address)
        return model
Example #14
    def __init__(self, local_dir, remote_dir, sync_client):
        self.local_ip = get_node_ip_address()
        self.worker_ip = None
        super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
Example #15
def _train(
        dtrain,
        params: Dict,
        *args,
        num_actors=None,
        evals=(),
        **kwargs,
):
    """
    Run distributed training of XGBoost model on Ray engine.

    During execution it evenly distributes `dtrain` between workers by
    IP-address partitioning (if `dtrain` is not evenly distributed across
    nodes, some partitions are re-distributed between nodes), runs
    `xgb.train` on each worker for its subset of `dtrain`, and reduces the
    training results of each worker using the Rabit context.

    Parameters
    ----------
    dtrain : modin.experimental.DMatrix
        Data to be trained against.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with the trained booster and a dict of evaluation
        results, in the form {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain
    dmatrix_kwargs = dtrain.get_dmatrix_params()

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(num_actors)

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    if evals:
        min_num_parts = num_actors
        for (eval_X, _), eval_method in evals:
            if len(eval_X) < min_num_parts:
                min_num_parts = len(eval_X)
                method_name = eval_method

        if num_actors != min_num_parts:
            num_actors = min_num_parts
            warnings.warn(
                f"`num_actors` is set to {num_actors}, because `evals` data with name `{method_name}` has only {num_actors} partition(s)."
            )

    actors = create_actors(num_actors)

    add_as_eval_method = None
    if evals:
        for (eval_data, method) in evals[:]:
            if eval_data is dtrain:
                add_as_eval_method = method
                evals.remove((eval_data, method))

        for ((eval_X, eval_y), eval_method) in evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method, **dmatrix_kwargs),
                eval_X,
                eval_y,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method, **dmatrix_kwargs),
        X_row_parts,
        y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
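A minimal usage sketch of how `_train` is reached in practice, assuming Modin's experimental XGBoost entry points (`modin.experimental.xgboost.DMatrix` and `train`); the public `train` wrapper is assumed to delegate to the `_train` helper above and hand back the trained booster:

import modin.pandas as pd
import modin.experimental.xgboost as mxgb

# Toy binary-classification data held as Modin DataFrames (partitioned over Ray).
X = pd.DataFrame({"f0": list(range(100)), "f1": [i % 7 for i in range(100)]})
y = pd.DataFrame({"label": [i % 2 for i in range(100)]})

dtrain = mxgb.DMatrix(X, y)
# The public wrapper is assumed to return the fitted booster; _train itself
# returns the {"booster": ..., "history": ...} dict described in its docstring.
booster = mxgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=10)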