Example 1
    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: float = 1,
                 num_gpus_per_worker: float = 0,
                 additional_resources_per_worker: Optional[Dict[str,
                                                                float]] = None,
                 max_retries: int = 3):
        self._backend_config = backend_config
        self._backend = self._backend_config.backend_cls()
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker
        self._additional_resources_per_worker = additional_resources_per_worker
        self._max_failures = max_retries
        if self._max_failures < 0:
            self._max_failures = float("inf")
        self._num_failures = 0
        self._initialization_hook = None

        if tune is not None and tune.is_session_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()

        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None

        self.checkpoint_manager.on_init()
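
A minimal standalone sketch (not taken from the source above) of the `max_retries` handling in this constructor: any negative value is treated as unlimited retries, mirroring the `float("inf")` branch. The helper name below is hypothetical.

def normalize_max_retries(max_retries: int) -> float:
    # Mirrors the constructor above: a negative value (e.g. -1) means
    # "retry forever"; otherwise the value is used as-is.
    return float("inf") if max_retries < 0 else float(max_retries)

assert normalize_max_retries(3) == 3
assert normalize_max_retries(-1) == float("inf")
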
Example 2
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0

        self.logdir = self.create_logdir(logdir)

        # Setup executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker.")
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker.")

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        self._backend_executor_actor = remote_executor.remote(
            backend_config=self._backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
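
The CPU/GPU override behavior above can be reproduced in isolation. The sketch below is illustrative only (the helper name is not part of Ray Train): `CPU`/`GPU` keys popped from `resources_per_worker` take precedence over the defaults and are validated against `use_gpu`, and the remaining entries are left as additional resources.

from typing import Dict, Optional, Tuple

def resolve_worker_resources(
    use_gpu: bool,
    resources_per_worker: Optional[Dict[str, float]] = None,
) -> Tuple[float, float, Optional[Dict[str, float]]]:
    num_cpus: float = 1
    num_gpus: float = int(use_gpu)
    if resources_per_worker:
        # Work on a copy so the caller's dict is not mutated by the pops.
        resources_per_worker = dict(resources_per_worker)
        num_cpus = resources_per_worker.pop("CPU", num_cpus)
        num_gpus = resources_per_worker.pop("GPU", num_gpus)
        if not use_gpu and num_gpus > 0:
            raise ValueError("`use_gpu` is False but `GPU` was requested.")
        if use_gpu and num_gpus == 0:
            raise ValueError("`use_gpu` is True but `GPU` is set to 0.")
    return num_cpus, num_gpus, resources_per_worker

assert resolve_worker_resources(False, {"CPU": 2}) == (2, 0, {})
assert resolve_worker_resources(True, {"GPU": 2, "memory": 1e9})[1] == 2
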
Example 3
class Trainer:
    """A class for enabling seamless distributed deep learning.

    Directory structure:
    - A logdir is created during instantiation. This will hold all the
    results/checkpoints for the lifetime of the Trainer. By default, it will be
    of the form ``~/ray_results/train_<datestring>``.
    - A run_dir is created for each ``run`` call. This will
    hold the checkpoints and results for a single ``trainer.run()`` or
    ``trainer.run_iterator()`` call. It will be of the form ``run_<run_id>``.

    Args:
        backend (Union[str, BackendConfig]): The backend used for
            distributed communication. If configurations are needed,
            a subclass of ``BackendConfig`` can be passed in.
            Supported ``str`` values: {"torch", "tensorflow", "horovod"}.
        num_workers (int): The number of workers (Ray actors) to launch.
            Each worker will reserve 1 CPU by default. The number of CPUs
            reserved by each worker can be overridden with the
            ``resources_per_worker`` argument.
        use_gpu (bool): If True, training will be done on GPUs (1 per
            worker). Defaults to False. The number of GPUs reserved by each
            worker can be overridden with the ``resources_per_worker``
            argument.
        resources_per_worker (Optional[Dict]): If specified, the resources
            defined in this Dict will be reserved for each worker. The
            ``CPU`` and ``GPU`` keys (case-sensitive) can be defined to
            override the number of CPUs/GPUs used by each worker.
        logdir (Optional[str]): Path to the file directory where logs
            should be persisted. If this is not specified, one will be
            generated.
        max_retries (int): Number of retries when Ray actors fail.
            Defaults to 3. Set to -1 for unlimited retries.
    """

    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        if not ray.is_initialized():
            ray.init()

        if "GPU" in ray.available_resources() and not use_gpu:
            logger.info(
                "GPUs are detected in your Ray cluster, but GPU "
                "training is not enabled for Ray Train. To enable "
                "GPU training, make sure to set `use_gpu` to True "
                "when instantiating your Trainer."
            )

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0
        self.logdir = self.create_logdir(logdir)

        # Setup executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker."
                )
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker."
                )

        runtime_env = {
            "env_vars": {
                var_name: os.environ[var_name]
                for var_name in BACKEND_ENV_VARS
                if var_name in os.environ
            }
        }

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        backend_executor_actor = remote_executor.options(
            runtime_env=runtime_env
        ).remote(
            backend_config=self._backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries,
        )

        self._backend_executor = ActorWrapper(backend_executor_actor)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()

    def create_logdir(self, log_dir: Optional[Union[str, Path]]) -> Path:
        """Create logdir for the Trainer."""
        # Create directory for logs.
        log_dir = Path(log_dir) if log_dir else None
        if not log_dir:
            # Initialize timestamp for identifying this Train execution.
            timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
            log_dir = Path(f"train_{timestr}")
        log_dir = construct_path(log_dir, DEFAULT_RESULTS_DIR)
        log_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Trainer logs will be logged in: {log_dir}")
        return log_dir

    def create_run_dir(self):
        """Create rundir for the particular training run."""
        self.latest_run_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Run results will be logged in: {self.latest_run_dir}")

    def _get_backend_config(self, backend: Union[str, BackendConfig]) -> BackendConfig:
        """Gets the ``BackendConfig`` to use for training.

        Args:
            backend (Union[str, BackendConfig]): If a ``BackendConfig`` is
                passed in, then it will also be returned. If a ``str`` is
                passed in, then the default config for that backend will be
                returned.

        Returns:
            The ``BackendConfig`` that will be used to set up the
            ``BackendExecutor``.
        """

        if isinstance(backend, BackendConfig):
            return backend
        elif isinstance(backend, str):
            return get_backend_config_cls(backend)()
        else:
            raise TypeError(f"Invalid type for backend: {type(backend)}.")

    def _is_tune_enabled(self):
        """Whether or not this Trainer is part of a Tune session."""
        return TUNE_INSTALLED and tune.is_session_enabled()

    def start(self, initialization_hook: Optional[Callable[[], None]] = None):
        """Starts the training execution service.

        Args:
            initialization_hook (Optional[Callable]): The function to call on
                each worker when it is instantiated.
        """
        self._backend_executor.start(initialization_hook)

    def run(
        self,
        train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
        config: Optional[Dict[str, Any]] = None,
        callbacks: Optional[List[TrainingCallback]] = None,
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
        checkpoint: Optional[Union[Dict, str, Path]] = None,
        checkpoint_strategy: Optional[CheckpointStrategy] = None,
    ) -> List[T]:
        """Runs a training function in a distributed manner.

        Args:
            train_func (Callable): The training function to execute.
                This can either take in no arguments or a ``config`` dict.
            config (Optional[Dict]): Configurations to pass into
                ``train_func``. If None then an empty Dict will be created.
            callbacks (Optional[List[TrainingCallback]]): A list of Callbacks
                which will be executed during training. If this is not set,
                currently there are NO default Callbacks.
            dataset (Optional[Union[RayDataset, Dict[str, RayDataset]]]):
                Distributed Ray :ref:`Dataset <dataset-api>` or
                :ref:`DatasetPipeline <dataset-pipeline-api>` to pass into the
                workers, which can be accessed from the training function via
                ``train.get_dataset_shard()``. Sharding will automatically be
                handled by the Trainer. Multiple Datasets can be passed in as
                a ``Dict`` that maps each name key to a Dataset value,
                and each Dataset can be accessed from the training function
                by passing in a `dataset_name` argument to
                ``train.get_dataset_shard()``.
            checkpoint (Optional[Dict|str|Path]): The checkpoint data that
                should be loaded onto each worker and accessed by the training
                function via ``train.load_checkpoint()``. If this is a ``str``
                or ``Path`` then the value is expected to be a path to a file
                that contains a serialized checkpoint dict. If this is
                ``None`` then no checkpoint will be loaded.
            checkpoint_strategy (Optional[CheckpointStrategy]): The
                configurations for saving checkpoints.

        Returns:
            A list of results from the training function. Each value in the
            list corresponds to the output of the training function from
            each worker.
        """
        # Create new log directory for this run.
        self._run_id += 1
        self.create_run_dir()

        # TODO(matt): Set default callbacks.
        callbacks = [] if callbacks is None else callbacks
        finished_with_errors = False

        for callback in callbacks:
            callback.start_training(
                logdir=str(self.latest_run_dir), config=config or {}
            )

        train_func = construct_train_func(train_func, config)

        try:
            iterator = TrainingIterator(
                backend_executor=self._backend_executor,
                backend_config=self._backend_config,
                train_func=train_func,
                dataset=dataset,
                checkpoint_manager=self.checkpoint_manager,
                checkpoint=checkpoint,
                checkpoint_strategy=checkpoint_strategy,
                run_dir=self.latest_run_dir,
            )
            for intermediate_result in iterator:
                for callback in callbacks:
                    callback.process_results(intermediate_result)

            assert iterator.is_finished()
            return iterator.get_final_results()
        except Exception:
            # Surface the failure to the callbacks before re-raising it.
            finished_with_errors = True
            raise
        finally:
            for callback in callbacks:
                callback.finish_training(error=finished_with_errors)

    def run_iterator(
        self,
        train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
        config: Optional[Dict[str, Any]] = None,
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
        checkpoint: Optional[Union[Dict, str, Path]] = None,
        checkpoint_strategy: Optional[CheckpointStrategy] = None,
    ) -> "TrainingIterator":
        """Same as ``run`` except returns an iterator over the results.

        This is useful if you want to have more customization of what to do
        with the intermediate results or how to use the ``Trainer`` with Ray
        Tune.

        .. code-block:: python

            def train_func(config):
                ...
                for _ in config["epochs"]:
                    metrics = train()
                    metrics = validate(...)
                    ray.train.report(**metrics)
                return model

            iterator = trainer.run_iterator(train_func, config=config)

            for result in iterator:
                do_stuff(result)
                latest_ckpt = trainer.get_latest_checkpoint()

            assert iterator.is_finished()
            model = iterator.get_final_results()[0]

        Args:
            train_func (Callable): The training function to execute.
                This can either take in no arguments or a ``config`` dict.
            config (Optional[Dict]): Configurations to pass into
                ``train_func``. If None then an empty Dict will be created.
            checkpoint (Optional[Dict|Path|str]): The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``train.load_checkpoint()``. If this is a
                ``str`` or ``Path`` then the value is expected to be a path
                to a file that contains a serialized checkpoint dict. If this
                is ``None`` then no checkpoint will be loaded.
            checkpoint_strategy (Optional[CheckpointStrategy]): The
                configurations for saving checkpoints.

        Returns:
            An Iterator over the intermediate results from ``train.report()``.
        """
        # Create new log directory for this run.
        self._run_id += 1
        self.create_run_dir()

        train_func = construct_train_func(train_func, config)

        return TrainingIterator(
            backend_executor=self._backend_executor,
            backend_config=self._backend_config,
            train_func=train_func,
            run_dir=self.latest_run_dir,
            dataset=dataset,
            checkpoint_manager=self.checkpoint_manager,
            checkpoint=checkpoint,
            checkpoint_strategy=checkpoint_strategy,
        )

    @property
    def latest_run_dir(self) -> Optional[Path]:
        """Path to the log directory for the latest call to ``run()``.

        Returns ``None`` if ``run()`` has not been called.
        """
        if self._run_id > 0:
            run_dir = Path(f"run_{self._run_id:03d}")
            return construct_path(run_dir, self.logdir)
        else:
            return None

    @property
    def latest_checkpoint_dir(self) -> Optional[Path]:
        """Path to the checkpoint directory.

        Returns ``None`` if ``run()`` has not been called or if
        ``train.save_checkpoint()`` has not been called from ``train_func``
        within the most recent call to ``run``.
        """
        return self.checkpoint_manager.latest_checkpoint_dir

    @property
    def best_checkpoint_path(self) -> Optional[Path]:
        """Path to the best persisted checkpoint from the latest run.

        "Best" is defined by the input ``CheckpointStrategy``.
        Default behavior is to return the most recent checkpoint.

        Returns ``None`` if ``run()`` has not been called or if
        ``train.save_checkpoint()`` has not been called from ``train_func``
        within the most recent call to ``run``.
        """
        return self.checkpoint_manager.best_checkpoint_path

    @property
    def latest_checkpoint(self) -> Optional[Dict]:
        """The latest saved checkpoint.

        This checkpoint may not be saved to disk.

        Returns ``None`` if ``run()`` has not been called or if
        ``train.save_checkpoint()`` has not been called from ``train_func``.
        """
        return self.checkpoint_manager.latest_checkpoint

    @property
    def best_checkpoint(self) -> Optional[Dict]:
        """Best saved checkpoint from the latest run.

        "Best" is defined by the input ``CheckpointStrategy``.
        Default behavior is to return the most recent checkpoint.

        Returns ``None`` if ``run()`` has not been called or if
        ``train.save_checkpoint()`` has not been called from ``train_func``
        within the most recent call to ``run``.
        """
        best_checkpoint_path = self.best_checkpoint_path
        if best_checkpoint_path is None:
            return None
        else:
            return load_checkpoint_from_path(best_checkpoint_path)

    @staticmethod
    def load_checkpoint_from_path(checkpoint_file_path: Union[str, Path]) -> Dict:
        """Convenience method to load a checkpoint from path.

        An error will be raised if the provided path does not exist.

        Args:
            checkpoint_file_path (Union[str, Path]): The path to the checkpoint
                to load. If the checkpoint saved in this path has not been
                created by Ray Train, there is no guarantee that it can be
                loaded in successfully.
        """
        return load_checkpoint_from_path(checkpoint_file_path)

    def shutdown(self):
        """Shuts down the training execution service."""
        self._backend_executor.shutdown()

    def to_tune_trainable(
        self,
        train_func: Callable[[Dict[str, Any]], T],
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
    ) -> Type[Trainable]:
        """Creates a Tune ``Trainable`` from the input training function.

        Args:
            train_func (Callable): The function that should be executed on each
                training worker.
            dataset (Optional[Union[RayDataset, Dict[str, RayDataset]]]):
                Distributed Ray :ref:`Dataset <dataset-api>` or
                :ref:`DatasetPipeline <dataset-pipeline-api>` to pass into the
                workers, which can be accessed from the training function via
                ``train.get_dataset_shard()``. Sharding will automatically be
                handled by the Trainer. Multiple Datasets can be passed in as
                a ``Dict`` that maps each name key to a Dataset value,
                and each Dataset can be accessed from the training function
                by passing in a `dataset_name` argument to
                ``train.get_dataset_shard()``.

        Returns:
            A Trainable that can directly be passed into ``tune.run()``.
        """
        if not TUNE_INSTALLED:
            raise ValueError(
                "Tune is not installed. Please install ray["
                "tune] to use the Tune integration."
            )

        if self._backend_executor.is_started():
            raise RuntimeError(
                "The Trainer must not be active to use "
                "`to_tune_trainable`. Either shutdown the "
                "Trainer or don't start it in the first place."
            )

        return _create_tune_trainable(
            train_func,
            dataset,
            self._backend_config,
            self._num_workers,
            self._use_gpu,
            self._resources_per_worker,
        )

    def to_worker_group(self, train_cls: Type, *args, **kwargs) -> "TrainWorkerGroup":
        """Returns Ray actors with the provided class and the backend started.

        This is useful if you want to provide your own class for training
        and have more control over execution, but still want to use Ray Train
        to setup the appropriate backend configurations (torch, tf, etc.).

        .. code-block:: python

            class Trainer:
                def __init__(self, config):
                    self.config = config

                def train_epoch(self):
                    ...
                    return 1

            config = {"lr": 0.1}
            trainer = Trainer(num_workers=2, backend="torch")
            workers = trainer.to_worker_group(train_cls=Trainer, config=config)
            futures = [w.train_epoch.remote() for w in workers]
            assert ray.get(futures) == [1, 1]
            assert ray.get(workers[0].train_epoch.remote()) == 1
            workers.shutdown()

        Args:
            train_cls (Type): The class definition to use for the Ray
                actors/workers.
            args, kwargs: Arguments to pass into the ``__init__`` of the
                provided ``train_cls``.
        """
        if self._backend_executor.is_started():
            raise RuntimeError(
                "The Trainer must not be active to use "
                "`to_worker_group`. Either shutdown the "
                "Trainer or don't start it in the first place."
            )
        self._backend_executor.start(
            train_cls=train_cls, train_cls_args=args, train_cls_kwargs=kwargs
        )
        worker_group = self._backend_executor.get_worker_group()
        return TrainWorkerGroup(worker_group)
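
For context, a hedged end-to-end usage sketch of the `Trainer` API documented above. It assumes the legacy `ray.train.Trainer` interface (pre Ray 2.0) with `ray` and `torch` installed; the worker count and epoch count are illustrative.

from ray import train
from ray.train import Trainer

def train_func(config):
    # Each `train.report()` call produces one intermediate result that
    # `Trainer.run()` forwards to any registered callbacks.
    for epoch in range(config["epochs"]):
        train.report(epoch=epoch, loss=1.0 / (epoch + 1))
    return "done"

if __name__ == "__main__":
    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config={"epochs": 3})
    assert results == ["done", "done"]  # one return value per worker
    trainer.shutdown()
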
Example 4
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        if not ray.is_initialized():
            ray.init()

        if "GPU" in ray.available_resources() and not use_gpu:
            logger.info(
                "GPUs are detected in your Ray cluster, but GPU "
                "training is not enabled for Ray Train. To enable "
                "GPU training, make sure to set `use_gpu` to True "
                "when instantiating your Trainer."
            )

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0
        self.logdir = self.create_logdir(logdir)

        # Setup executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker."
                )
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker."
                )

        runtime_env = {
            "env_vars": {
                var_name: os.environ[var_name]
                for var_name in BACKEND_ENV_VARS
                if var_name in os.environ
            }
        }

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        backend_executor_actor = remote_executor.options(
            runtime_env=runtime_env
        ).remote(
            backend_config=self._backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries,
        )

        self._backend_executor = ActorWrapper(backend_executor_actor)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
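
The `runtime_env` construction above forwards only the backend-related environment variables that are actually set locally to the `BackendExecutor` actor. A standalone sketch, with an illustrative variable list (the real `BACKEND_ENV_VARS` constant lives in Ray Train's constants module):

import os

BACKEND_ENV_VARS = ("NCCL_SOCKET_IFNAME", "GLOO_SOCKET_IFNAME")  # illustrative

runtime_env = {
    "env_vars": {
        name: os.environ[name]
        for name in BACKEND_ENV_VARS
        if name in os.environ
    }
}
# Unset variables are simply skipped rather than forwarded as empty strings.
print(runtime_env)
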
Example 5
class BackendExecutor:
    """Main execution class for training backends.

    This class holds a worker group and is responsible for executing the
    training function on the workers, and collecting intermediate results
    from ``train.report()`` and ``train.save_checkpoint()``.

    Args:
        backend_config (BackendConfig): The configurations for this
            specific backend.
        num_workers (int): Number of workers to use for training.
        num_cpus_per_worker (float): Number of CPUs to use per worker.
        num_gpus_per_worker (float): Number of GPUs to use per worker.
        additional_resources_per_worker (Optional[Dict[str, float]]):
            Dictionary specifying the extra resources that will be
            requested for each worker in addition to ``num_cpus_per_worker``
            and ``num_gpus_per_worker``.
        max_retries (int): Number of retries when Ray actors fail.
            Defaults to 3. Set to -1 for unlimited retries.

    Attributes:
        latest_checkpoint_dir (Optional[Path]): Path to the file directory for
            the checkpoints from the latest run. Configured through
            ``start_training``.
        best_checkpoint_path (Optional[Path]): Path to the best persisted
            checkpoint from the latest run.
        latest_checkpoint (Optional[Dict]): The latest saved checkpoint. This
            checkpoint may not be saved to disk.
    """
    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: float = 1,
                 num_gpus_per_worker: float = 0,
                 additional_resources_per_worker: Optional[Dict[str,
                                                                float]] = None,
                 max_retries: int = 3):
        self._backend_config = backend_config
        self._backend = self._backend_config.backend_cls()
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker
        self._additional_resources_per_worker = additional_resources_per_worker
        self._max_failures = max_retries
        if self._max_failures < 0:
            self._max_failures = float("inf")
        self._num_failures = 0
        self._initialization_hook = None

        if tune is not None and tune.is_session_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()

        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None

        self.checkpoint_manager.on_init()

    def start(self,
              initialization_hook: Optional[Callable[[], None]] = None,
              train_cls: Optional[Type] = None,
              train_cls_args: Optional[Tuple] = None,
              train_cls_kwargs: Optional[Dict] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            num_cpus_per_worker=self._num_cpus_per_worker,
            num_gpus_per_worker=self._num_gpus_per_worker,
            additional_resources_per_worker=(
                self._additional_resources_per_worker),
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs)
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)

            share_cuda_visible_devices_enabled = bool(
                env_integer(ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                            self._backend.share_cuda_visible_devices))

            if (self._num_gpus_per_worker > 0
                    and share_cuda_visible_devices_enabled):
                self._share_cuda_visible_devices()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            self._increment_failures()
            self._restart()

    def _share_cuda_visible_devices(self):
        """Sets CUDA_VISIBLE_DEVICES on all workers.

        For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs
        visible to all workers on that worker's node.

        This allows GPU workers on the same node to communicate with one
        another.

        Example:

            Setup:
            - Node1:
                - Worker1: {0, 1}
                - Worker2: {2, 3}
            - Node2:
                - Worker3: {0, 1}

            CUDA_VISIBLE_DEVICES:
            - Worker1: "0,1,2,3"
            - Worker2: "0,1,2,3"
            - Worker2: "0,1"

        """

        node_ids_and_gpu_ids = [(w.metadata.node_id, w.metadata.gpu_ids)
                                for w in self.worker_group.workers]

        node_id_to_worker_id = defaultdict(set)
        node_id_to_gpu_ids = defaultdict(set)

        for worker_id, (node_id, gpu_ids) in enumerate(node_ids_and_gpu_ids):
            node_id_to_worker_id[node_id].add(worker_id)
            node_id_to_gpu_ids[node_id].update(gpu_ids)

        futures = []
        for node_id, gpu_ids in node_id_to_gpu_ids.items():
            all_gpu_ids = ",".join([str(gpu_id) for gpu_id in gpu_ids])

            def set_gpu_ids():
                os.environ["CUDA_VISIBLE_DEVICES"] = all_gpu_ids

            for worker_id in node_id_to_worker_id[node_id]:
                futures.append(
                    self.worker_group.execute_single_async(
                        worker_id, set_gpu_ids))
        ray.get(futures)

    def _create_local_rank_map(self) -> Dict:
        """Create mapping from worker world_rank to local_rank.

        Example:
            Worker 0: 0.0.0.0
            Worker 1: 0.0.0.0
            Worker 2: 0.0.0.1
            Worker 3: 0.0.0.0
            Worker 4: 0.0.0.1

            Workers 0, 1, 3 are on 0.0.0.0.
            Workers 2, 4 are on 0.0.0.1.

            Expected Output:
            {
                0 -> 0,
                1 -> 1,
                2 -> 0,
                3 -> 2,
                4 -> 1
            }
        """
        rank_mapping = {}
        ip_dict = defaultdict(int)
        for world_rank in range(len(self.worker_group)):
            worker = self.worker_group.workers[world_rank]
            node_ip = worker.metadata.node_ip
            rank_mapping[world_rank] = ip_dict[node_ip]
            ip_dict[node_ip] += 1
        return rank_mapping

    def _get_dataset_shards(self, dataset_or_dict):

        if dataset_or_dict is None:
            # Return None for each shard.
            return [None] * len(self.worker_group)

        def split_dataset(dataset_or_pipeline):
            actors = [worker.actor for worker in self.worker_group.workers]
            return dataset_or_pipeline.split(len(self.worker_group),
                                             equal=True,
                                             locality_hints=actors)

        if isinstance(dataset_or_dict, dict):
            # Return a smaller dict for each shard.
            dataset_shards = [{} for _ in range(len(self.worker_group))]
            for key, dataset in dataset_or_dict.items():
                split_datasets = split_dataset(dataset)
                assert len(split_datasets) == len(self.worker_group)
                for i in range(len(split_datasets)):
                    dataset_shards[i][key] = split_datasets[i]
            return dataset_shards
        else:
            # return a smaller RayDataset for each shard.
            return split_dataset(dataset_or_dict)

    def start_training(
        self,
        train_func: Callable[[], T],
        run_dir: Path,
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
        checkpoint: Optional[Union[Dict, str, Path]] = None,
        checkpoint_strategy: Optional[CheckpointStrategy] = None,
        latest_checkpoint_id: Optional[int] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func (Callable): The training function to run on each worker.
            run_dir (Path): The directory to use for this run.
            dataset (Optional[Union[Dataset, DatasetPipeline]]):
                Distributed Ray Dataset or DatasetPipeline to pass into the
                workers, which can be accessed from the training function via
                ``train.get_dataset_shard()``. Sharding will automatically be
                handled by the Trainer. Multiple Datasets can be passed in as
                a ``Dict`` that maps each name key to a Dataset value,
                and each Dataset can be accessed from the training function
                by passing in a `dataset_name` argument to
                ``train.get_dataset_shard()``.
            checkpoint (Optional[Dict|str|Path]): The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``train.load_checkpoint()``. If this is a
                ``str`` or ``Path`` then the value is expected to be a path
                to a file that contains a serialized checkpoint dict. If this
                is ``None`` then no checkpoint will be loaded.
            checkpoint_strategy (Optional[CheckpointStrategy]): The
                configurations for saving checkpoints.
            latest_checkpoint_id (Optional[int]): The checkpoint id of the
                most recently saved checkpoint.
        """
        self.checkpoint_manager.on_start_training(
            checkpoint_strategy=checkpoint_strategy,
            run_dir=run_dir,
            latest_checkpoint_id=latest_checkpoint_id)

        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(train_func, world_rank, local_rank, checkpoint,
                               dataset_shard):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    dataset_shard=dataset_shard,
                    checkpoint=checkpoint,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics
                )
            except ValueError:
                raise TrainBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        if self.dataset_shards is None:
            self.dataset_shards = self._get_dataset_shards(dataset)

        checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)

        local_rank_map = self._create_local_rank_map()

        futures = []
        for index in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    index,
                    initialize_session,
                    world_rank=index,
                    local_rank=local_rank_map[index],
                    train_func=train_func,
                    dataset_shard=self.dataset_shards[index],
                    checkpoint=checkpoint_dict))

        self.get_with_failure_handling(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)

    def _get_next_results(self) -> Optional[List[TrainingResult]]:
        """Fetches the next ``TrainingResult`` from each worker.

        Each ``TrainingResult`` is expected to correspond to the same step from
        each worker (e.g. the same call to ``train.report()`` or
        ``train.save_checkpoint()``).

        Returns:
            A list of ``TrainingResult``s with the same
            ``TrainingResultType``, or ``None`` if there are no more results.
        """
        def get_next():
            # Get the session for this worker.
            try:
                session = get_session()
            except ValueError:
                # Session is not initialized yet.
                raise TrainBackendError("`fetch_next_result` has been called "
                                        "before `start_training`. Please call "
                                        "`start_training` before "
                                        "`fetch_next_result`.")

            try:
                result = session.get_next()
            except RuntimeError:
                # Training thread has not been started yet.
                raise TrainBackendError("`fetch_next_result` has been called "
                                        "before `start_training`. Please call "
                                        "`start_training` before "
                                        "`fetch_next_result`.")

            return result

        # Get next result from each worker.
        futures = self.worker_group.execute_async(get_next)
        results = self.get_with_failure_handling(futures)

        # Check if any worker returned None.
        if any(r is None for r in results):
            # Either all workers have results or none of them do.
            if not all(r is None for r in results):
                raise RuntimeError(
                    "Some workers returned results while "
                    "others didn't. Make sure that "
                    "`train.report()` and `train.checkpoint()` "
                    "are called the same number of times on all "
                    "workers.")
            else:
                # Return None if all results are None.
                return None
        first_result = results[0]
        result_type = first_result.type
        if any(r.type != result_type for r in results):
            raise RuntimeError("Some workers returned results with "
                               "different types. Make sure `train.report()` "
                               "and `train.save_checkpoint()` are called the "
                               "same number of times and in the same order on "
                               "each worker.")
        return results

    def fetch_next_result(self) -> Optional[List[Dict]]:
        """Fetch next results produced by ``train.report()`` from each worker.

        Assumes ``start_training`` has already been called.

        Returns:
            A list of dictionaries of values passed to ``train.report()`` from
                each worker. Each item corresponds to an intermediate result
                from a single worker. If there are no more items to fetch,
                returns None.
        """

        while True:
            results = self._get_next_results()
            if results is None:
                return None
            first_result = results[0]
            result_type = first_result.type
            if result_type is TrainingResultType.REPORT:
                result_data = [r.data for r in results]
                return result_data
            elif result_type is TrainingResultType.CHECKPOINT:
                self.checkpoint_manager._process_checkpoint(results)
                # Iterate until next REPORT call or training has finished.
            else:
                raise TrainBackendError(f"Unexpected result type: "
                                        f"{result_type}. "
                                        f"Expected one of "
                                        f"{[type in TrainingResultType]}")

    def finish_training(self) -> List[T]:
        """Finish training and return final results. Propagate any exceptions.

        Blocks until training is finished on all workers.

        Assumes `start_training` has already been called.

        Returns:
            A list of return values from calling ``train_func`` on each worker.
                Each item corresponds to the return value from a single worker.
        """
        def pause_reporting():
            # Get the session for this worker.
            try:
                session = get_session()
            except ValueError:
                # Session is not initialized yet.
                raise TrainBackendError("`finish_training` has been called "
                                        "before `start_training`. Please call "
                                        "`start_training` before "
                                        "`finish_training`.")

            return session.pause_reporting()

        def end_training():
            # Get the session for this worker.
            try:
                session = get_session()
            except ValueError:
                # Session is not initialized yet.
                raise TrainBackendError("`finish_training` has been called "
                                        "before `start_training`. Please call "
                                        "`start_training` before "
                                        "`finish_training`.")

            try:
                # session.finish raises any Exceptions from training.
                output = session.finish()
            finally:
                # Shutdown session even if session.finish() raises an
                # Exception.
                shutdown_session()

            return output

        # Disable workers from enqueuing results from `train.report()`.
        # Results will not be processed during the execution of `finish`.
        # Note: Reported results may still be enqueued at this point,
        #       and should be handled appropriately.
        futures = self.worker_group.execute_async(pause_reporting)
        self.get_with_failure_handling(futures)

        # Finish up processing checkpoints. Reporting has been disabled.
        while True:
            results = self._get_next_results()
            if results is None:
                break
            result_type = results[0].type
            # Process checkpoints and ignore other result types.
            if result_type is TrainingResultType.CHECKPOINT:
                self.checkpoint_manager._process_checkpoint(results)

        futures = self.worker_group.execute_async(end_training)
        results = self.get_with_failure_handling(futures)
        return results

    def get_with_failure_handling(self, remote_values):
        """Gets the remote values while handling for worker failures.

        This method should be called instead of ``ray.get()`` directly in
        order to handle worker failures.

        If a worker failure is identified, backend specific failure handling
        is executed and a ``TrainingWorkerError`` is raised.

        Args:
            remote_values (list): List of object refs representing functions
                that may fail in the middle of execution. For example, running
                a Train training loop in multiple parallel actor calls.
        Returns:
            The resolved objects represented by the passed in ObjectRefs.
        """
        success, failed_worker_indexes = check_for_failure(remote_values)
        if success:
            return ray.get(remote_values)
        else:
            self._increment_failures()
            try:
                self._backend.handle_failure(self.worker_group,
                                             failed_worker_indexes,
                                             self._backend_config)
            except RayActorError as exc:
                logger.exception(str(exc))
                self._restart()
            raise TrainingWorkerError

    def shutdown(self):
        """Shuts down the workers in the worker group."""
        try:
            self._backend.on_shutdown(self.worker_group, self._backend_config)
        except RayActorError:
            logger.warning("Graceful shutdown of backend failed. This is "
                           "expected if one of the workers has crashed.")
        self.worker_group.shutdown()
        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None

    @property
    def is_started(self):
        return not isinstance(self.worker_group, InactiveWorkerGroup)

    @property
    def latest_checkpoint_dir(self) -> Optional[Path]:
        """Path to the latest checkpoint directory."""
        return self.checkpoint_manager.latest_checkpoint_dir

    @property
    def best_checkpoint_path(self) -> Optional[Path]:
        """Path to the best persisted checkpoint."""
        return self.checkpoint_manager.best_checkpoint_path

    @property
    def latest_checkpoint_id(self) -> Optional[int]:
        """The checkpoint id of most recently saved checkpoint.

        If no checkpoint has been saved yet, then return None.
        """
        checkpoint_id = self.checkpoint_manager._latest_checkpoint_id
        if checkpoint_id == 0:
            return None
        else:
            return checkpoint_id

    @property
    def latest_checkpoint(self) -> Optional[Dict]:
        """Latest checkpoint object."""
        return self.checkpoint_manager.latest_checkpoint

    def _restart(self):
        self.worker_group.shutdown()
        if self._initialization_hook is not None:
            initialization_hook = self._initialization_hook
        else:
            initialization_hook = None
        self.start(initialization_hook=initialization_hook)

    def _increment_failures(self):
        self._num_failures += 1
        if self._num_failures >= self._max_failures:
            raise RuntimeError("Training has failed even after "
                               f"{self._num_failures} "
                               "attempts. You can change the number of max "
                               "failure attempts by setting the "
                               "`max_retries` arg in your `Trainer`.") \
                from None
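
The world-rank to local-rank assignment documented in `_create_local_rank_map` can be checked in isolation. The sketch below (helper name hypothetical) reproduces the exact mapping from the docstring example: local ranks are assigned per node IP, in world-rank order.

from collections import defaultdict
from typing import Dict, List

def local_rank_map(node_ips: List[str]) -> Dict[int, int]:
    mapping: Dict[int, int] = {}
    per_ip: Dict[str, int] = defaultdict(int)
    for world_rank, ip in enumerate(node_ips):
        # The local rank is the number of workers already seen on this node.
        mapping[world_rank] = per_ip[ip]
        per_ip[ip] += 1
    return mapping

# Workers 0, 1, 3 on 0.0.0.0 and workers 2, 4 on 0.0.0.1, as in the docstring.
assert local_rank_map(
    ["0.0.0.0", "0.0.0.0", "0.0.0.1", "0.0.0.0", "0.0.0.1"]
) == {0: 0, 1: 1, 2: 0, 3: 2, 4: 1}
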
Example 6
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        warnings.warn(
            "The `ray.train.Trainer` API will be deprecated in Ray "
            "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
            "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
            "provide greater functionality than `ray.train.Trainer`, "
            "and with a more flexible and easy-to-use API.",
            PendingDeprecationWarning,
            stacklevel=2,
        )

        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        if not ray.is_initialized():
            ray.init()

        if "GPU" in ray.available_resources() and not use_gpu:
            logger.info("GPUs are detected in your Ray cluster, but GPU "
                        "training is not enabled for Ray Train. To enable "
                        "GPU training, make sure to set `use_gpu` to True "
                        "when instantiating your Trainer.")

        if resources_per_worker is not None:
            # Copy this parameter to avoid mutating the user input
            resources_per_worker = copy.deepcopy(resources_per_worker)

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0
        self.logdir = self.create_logdir(logdir)

        # Setup executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker.")
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker.")

        runtime_env = {
            "env_vars": {
                var_name: os.environ[var_name]
                for var_name in BACKEND_ENV_VARS if var_name in os.environ
            }
        }

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        backend_executor_actor = remote_executor.options(
            runtime_env=runtime_env).remote(
                backend_config=self._backend_config,
                num_workers=num_workers,
                num_cpus_per_worker=num_cpus,
                num_gpus_per_worker=num_gpus,
                additional_resources_per_worker=resources_per_worker,
                max_retries=max_retries,
            )

        self._backend_executor = ActorWrapper(backend_executor_actor)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
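
A small sketch of why the deep copy above matters: the constructor later pops "CPU" and "GPU" out of `resources_per_worker`, so copying first keeps the caller's dict untouched. The dict contents below are illustrative.

import copy

user_resources = {"CPU": 4, "GPU": 1, "custom_resource": 1}

# Work on a copy, as the constructor does, so the pops do not mutate the
# caller's dict.
internal = copy.deepcopy(user_resources)
num_cpus = internal.pop("CPU", 1)
num_gpus = internal.pop("GPU", 0)

assert (num_cpus, num_gpus) == (4, 1)
assert internal == {"custom_resource": 1}
assert user_resources == {"CPU": 4, "GPU": 1, "custom_resource": 1}
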