Example #1
    def __init__(self,
                 backend_config: BackendConfig,
                 num_workers: int = 1,
                 num_cpus_per_worker: float = 1,
                 num_gpus_per_worker: float = 0,
                 additional_resources_per_worker: Optional[Dict[str,
                                                                float]] = None,
                 max_retries: int = 3):
        self._backend_config = backend_config
        self._backend = self._backend_config.backend_cls()
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker
        self._additional_resources_per_worker = additional_resources_per_worker
        self._max_failures = max_retries
        # A negative ``max_retries`` is treated as unlimited retries.
        if self._max_failures < 0:
            self._max_failures = float("inf")
        self._num_failures = 0
        self._initialization_hook = None

        # Use the Tune-aware checkpoint manager when running inside a Tune session.
        if tune is not None and tune.is_session_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()

        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None

        self.checkpoint_manager.on_init()
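
For reference, the retry handling above can be read as a small standalone rule: a negative max_retries means "retry failed workers forever". The sketch below restates that logic outside the class; normalize_max_failures is a hypothetical helper used only for illustration.

def normalize_max_failures(max_retries: int) -> float:
    # Hypothetical helper mirroring the constructor above: negative values
    # are interpreted as "unlimited retries".
    return float("inf") if max_retries < 0 else float(max_retries)

assert normalize_max_failures(3) == 3
assert normalize_max_failures(-1) == float("inf")
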
Example #2
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0

        self.logdir = self.create_logdir(logdir)

        # Set up the executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker.")
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker.")

        # The BackendExecutor actor itself is scheduled with zero CPUs; the
        # per-worker CPU/GPU resources are requested separately below.
        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        self._backend_executor_actor = remote_executor.remote(
            backend_config=self._backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
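
A hedged usage sketch of the constructor above. It assumes this __init__ belongs to ray.train.Trainer (the pre-2.0 Ray Train API) and that "torch" is a backend string accepted by _get_backend_config; both may differ in your installation.

import ray
from ray.train import Trainer  # assumed import path for the class shown above

ray.init()

# One CPU per worker unless overridden; requesting "GPU" here while
# use_gpu=False would trigger the ValueError defined in the constructor.
trainer = Trainer(
    backend="torch",                  # assumed to be a registered backend name
    num_workers=2,
    use_gpu=False,
    resources_per_worker={"CPU": 2},
)
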
Example #3
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        if not ray.is_initialized():
            ray.init()

        if "GPU" in ray.available_resources() and not use_gpu:
            logger.info(
                "GPUs are detected in your Ray cluster, but GPU "
                "training is not enabled for Ray Train. To enable "
                "GPU training, make sure to set `use_gpu` to True "
                "when instantiating your Trainer."
            )

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0
        self.logdir = self.create_logdir(logdir)

        # Set up the executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker."
                )
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker."
                )

        runtime_env = {
            "env_vars": {
                var_name: os.environ[var_name]
                for var_name in BACKEND_ENV_VARS
                if var_name in os.environ
            }
        }

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        backend_executor_actor = remote_executor.options(
            runtime_env=runtime_env
        ).remote(
            backend_config=self._backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries,
        )

        self._backend_executor = ActorWrapper(backend_executor_actor)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
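
The runtime_env block in this version forwards selected environment variables from the driver process to the executor actor. A standalone sketch of the same pattern follows; the allow-list here is made up for illustration, while the real BACKEND_ENV_VARS comes from Ray Train's constants.

import os

# Hypothetical allow-list standing in for ray.train's BACKEND_ENV_VARS.
BACKEND_ENV_VARS = ("NCCL_SOCKET_IFNAME", "NCCL_DEBUG")

runtime_env = {
    "env_vars": {
        var_name: os.environ[var_name]
        for var_name in BACKEND_ENV_VARS
        if var_name in os.environ  # only forward variables that are actually set
    }
}
# Passing this dict to ``.options(runtime_env=...)`` starts the remote actor
# with the same backend-related environment as the driver.
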
Example #4
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):
        warnings.warn(
            "The `ray.train.Trainer` API will be deprecated in Ray "
            "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
            "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
            "provide greater functionality than `ray.train.Trainer`, "
            "and with a more flexible and easy-to-use API.",
            PendingDeprecationWarning,
            stacklevel=2,
        )

        if num_workers <= 0:
            raise ValueError("`num_workers` must be a positive integer.")

        if not ray.is_initialized():
            ray.init()

        if "GPU" in ray.available_resources() and not use_gpu:
            logger.info("GPUs are detected in your Ray cluster, but GPU "
                        "training is not enabled for Ray Train. To enable "
                        "GPU training, make sure to set `use_gpu` to True "
                        "when instantiating your Trainer.")

        if resources_per_worker is not None:
            # Copy this parameter to avoid mutating the user input
            resources_per_worker = copy.deepcopy(resources_per_worker)

        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0
        self.logdir = self.create_logdir(logdir)

        # Set up the executor.
        self._backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker.")
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker.")

        runtime_env = {
            "env_vars": {
                var_name: os.environ[var_name]
                for var_name in BACKEND_ENV_VARS if var_name in os.environ
            }
        }

        remote_executor = ray.remote(num_cpus=0)(BackendExecutor)

        backend_executor_actor = remote_executor.options(
            runtime_env=runtime_env).remote(
                backend_config=self._backend_config,
                num_workers=num_workers,
                num_cpus_per_worker=num_cpus,
                num_gpus_per_worker=num_gpus,
                additional_resources_per_worker=resources_per_worker,
                max_retries=max_retries,
            )

        self._backend_executor = ActorWrapper(backend_executor_actor)

        if self._is_tune_enabled():
            self.checkpoint_manager = TuneCheckpointManager()
        else:
            self.checkpoint_manager = CheckpointManager()
        self.checkpoint_manager.on_init()
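
Besides the deprecation warning, the main change in this variant is the copy.deepcopy of resources_per_worker. The standalone sketch below shows why: the later pop("CPU") / pop("GPU") calls would otherwise mutate the dict the caller passed in.

import copy

user_resources = {"CPU": 2, "GPU": 1}

# Work on a copy so popping keys does not silently alter the caller's dict.
resources_per_worker = copy.deepcopy(user_resources)
num_cpus = resources_per_worker.pop("CPU", 1)
num_gpus = resources_per_worker.pop("GPU", 0)

assert user_resources == {"CPU": 2, "GPU": 1}  # caller's input is untouched
assert (num_cpus, num_gpus) == (2, 1)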