def log_sync_template(options=""):
    """Template enabling syncs between driver and worker when possible.

    Requires ray cluster to be started with the autoscaler.
    Also requires rsync to be installed.

    Args:
        options (str): Additional rsync options.

    Returns:
        Sync template with source and target parameters. None if rsync
        unavailable.
    """
    if not distutils.spawn.find_executable("rsync"):
        if log_once("tune:rsync"):
            logger.error("Log sync requires rsync to be installed.")
        return None
    global _log_sync_warned
    ssh_key = get_ssh_key()
    if ssh_key is None:
        if not _log_sync_warned:
            logger.debug("Log sync requires cluster to be setup with "
                         "`ray up`.")
            _log_sync_warned = True
        return None

    rsh = "ssh -i {ssh_key} -o ConnectTimeout=120s -o StrictHostKeyChecking=no"
    rsh = rsh.format(ssh_key=quote(ssh_key))
    template = "rsync {options} -savz -e {rsh} {{source}} {{target}}"
    return template.format(options=options, rsh=quote(rsh))
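# A minimal usage sketch, not part of the original source: the returned
# template keeps `{source}` and `{target}` placeholders for the caller to
# fill in before running the command. The paths, remote address, and the
# extra rsync option below are hypothetical; `quote` and `subprocess` are
# assumed to be available from the surrounding module context.
template = log_sync_template(options="--exclude=checkpoint_*")
if template is not None:
    sync_cmd = template.format(
        source=quote("/tmp/ray_results/exp/"),
        target=quote("ubuntu@worker-ip:/tmp/ray_results/exp/"))
    subprocess.check_call(sync_cmd, shell=True)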
def sync_to_worker_if_possible(self):
    """Syncs the local logdir on driver to worker if possible.

    Requires ray cluster to be started with the autoscaler.
    Also requires rsync to be installed.
    """
    if self.worker_ip == self.local_ip:
        return
    ssh_key = get_ssh_key()
    ssh_user = get_ssh_user()
    global _log_sync_warned
    if ssh_key is None or ssh_user is None:
        if not _log_sync_warned:
            logger.error("Log sync requires cluster to be setup with "
                         "`ray up`.")
            _log_sync_warned = True
        return
    if not distutils.spawn.find_executable("rsync"):
        logger.error("Log sync requires rsync to be installed.")
        return
    source = '{}/'.format(self.local_dir)
    target = '{}@{}:{}/'.format(ssh_user, self.worker_ip, self.local_dir)
    final_cmd = (("""rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
                  """-o StrictHostKeyChecking=no" {} {}""").format(
                      quote(ssh_key), quote(source), quote(target)))
    logger.info("Syncing results to %s", str(self.worker_ip))
    sync_process = subprocess.Popen(
        final_cmd, shell=True, stdout=self.logfile)
    sync_process.wait()
def sync_now(self, force=False):
    self.last_sync_time = time.time()
    if not self.worker_ip:
        logger.debug("Worker ip unknown, skipping log sync for {}".format(
            self.local_dir))
        return

    if self.worker_ip == self.local_ip:
        worker_to_local_sync_cmd = None  # don't need to rsync
    else:
        ssh_key = get_ssh_key()
        ssh_user = get_ssh_user()
        if ssh_key is None or ssh_user is None:
            logger.error("Log sync requires cluster to be setup with "
                         "`ray create_or_update`.")
            return
        if not distutils.spawn.find_executable("rsync"):
            logger.error("Log sync requires rsync to be installed.")
            return
        source = '{}@{}:{}/'.format(ssh_user, self.worker_ip,
                                    self.local_dir)
        target = '{}/'.format(self.local_dir)
        worker_to_local_sync_cmd = ((
            """rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
            """-o StrictHostKeyChecking=no" {} {}""").format(
                quote(ssh_key), quote(source), quote(target)))

    if self.remote_dir:
        if self.sync_func:
            local_to_remote_sync_cmd = None
            try:
                self.sync_func(self.local_dir, self.remote_dir)
            except Exception:
                logger.exception("Sync function failed.")
        else:
            local_to_remote_sync_cmd = self.get_remote_sync_cmd()
    else:
        local_to_remote_sync_cmd = None

    if self.sync_process:
        self.sync_process.poll()
        if self.sync_process.returncode is None:
            if force:
                self.sync_process.kill()
            else:
                logger.warning("Last sync is still in progress, skipping.")
                return

    if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
        final_cmd = ""
        if worker_to_local_sync_cmd:
            final_cmd += worker_to_local_sync_cmd
        if local_to_remote_sync_cmd:
            if final_cmd:
                final_cmd += " && "
            final_cmd += local_to_remote_sync_cmd
        logger.debug("Running log sync: {}".format(final_cmd))
        self.sync_process = subprocess.Popen(final_cmd, shell=True)
def sync_now(self, force=False):
    self.last_sync_time = time.time()
    if not self.worker_ip:
        print("Worker ip unknown, skipping log sync for {}".format(
            self.local_dir))
        return

    if self.worker_ip == self.local_ip:
        worker_to_local_sync_cmd = None  # don't need to rsync
    else:
        ssh_key = get_ssh_key()
        ssh_user = get_ssh_user()
        if ssh_key is None or ssh_user is None:
            print("Error: log sync requires cluster to be setup with "
                  "`ray create_or_update`.")
            return
        if not distutils.spawn.find_executable("rsync"):
            print("Error: log sync requires rsync to be installed.")
            return
        worker_to_local_sync_cmd = ((
            """rsync -avz -e "ssh -i {} -o ConnectTimeout=120s """
            """-o StrictHostKeyChecking=no" '{}@{}:{}/' '{}/'""").format(
                quote(ssh_key), ssh_user, self.worker_ip,
                quote(self.local_dir), quote(self.local_dir)))

    if self.remote_dir:
        if self.remote_dir.startswith(S3_PREFIX):
            local_to_remote_sync_cmd = ("aws s3 sync {} {}".format(
                quote(self.local_dir), quote(self.remote_dir)))
        elif self.remote_dir.startswith(GCS_PREFIX):
            local_to_remote_sync_cmd = ("gsutil rsync -r {} {}".format(
                quote(self.local_dir), quote(self.remote_dir)))
    else:
        local_to_remote_sync_cmd = None

    if self.sync_process:
        self.sync_process.poll()
        if self.sync_process.returncode is None:
            if force:
                self.sync_process.kill()
            else:
                print("Warning: last sync is still in progress, skipping")
                return

    if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
        final_cmd = ""
        if worker_to_local_sync_cmd:
            final_cmd += worker_to_local_sync_cmd
        if local_to_remote_sync_cmd:
            if final_cmd:
                final_cmd += " && "
            final_cmd += local_to_remote_sync_cmd
        print("Running log sync: {}".format(final_cmd))
        self.sync_process = subprocess.Popen(final_cmd, shell=True)
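# S3_PREFIX and GCS_PREFIX are module-level constants not shown in this
# excerpt. Their values are an assumption inferred from the CLI tools the
# branches above invoke (`aws s3 sync`, `gsutil rsync`):
#
#   S3_PREFIX = "s3://"
#   GCS_PREFIX = "gs://"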
def log_sync_template():
    """Returns a template for syncing the local_dir between driver and worker.

    Requires ray cluster to be started with the autoscaler.
    Also requires rsync to be installed.
    """
    if not distutils.spawn.find_executable("rsync"):
        logger.error("Log sync requires rsync to be installed.")
        return
    global _log_sync_warned
    ssh_key = get_ssh_key()
    if ssh_key is None:
        if not _log_sync_warned:
            logger.error("Log sync requires cluster to be setup with "
                         "`ray up`.")
            _log_sync_warned = True
        return
    return ("""rsync -savz -e "ssh -i {ssh_key} -o ConnectTimeout=120s """
            """-o StrictHostKeyChecking=no" {{source}} {{target}}""").format(
                ssh_key=quote(ssh_key))
def DistributedTrainableCreator(
        func: Callable,
        use_gpu: bool = False,
        num_hosts: int = 1,
        num_slots: int = 1,
        num_cpus_per_slot: int = 1,
        timeout_s: int = 30,
        replicate_pem: bool = False) -> Type[_HorovodTrainable]:
    """Converts Horovod functions to be executable by Tune.

    Requires horovod > 0.19 to work.

    This function wraps and sets the resources for a given Horovod
    function to be used with Tune. It generates a Horovod Trainable (trial)
    which can itself be a distributed training job.

    One basic assumption of this implementation is that all sub-workers of
    a trial will be placed evenly across different machines.

    It is recommended that if `num_hosts` per trial > 1, you set
    num_slots == the size (or number of GPUs) of a single host.
    If num_hosts == 1, then you can set num_slots to be <=
    the size (number of GPUs) of a single host.

    This above assumption can be relaxed - please file a feature request
    on Github to inform the maintainers.

    Another assumption is that this API requires gloo as the underlying
    communication primitive. You will need to install Horovod with
    `HOROVOD_WITH_GLOO` enabled.

    *Fault Tolerance:* The trial workers themselves are not fault tolerant.
    When a host of a trial fails, all workers of a trial are expected to
    die, and the trial is expected to restart. This currently does not
    support function checkpointing.

    Args:
        func (Callable[[dict], None]): A training function that takes in
            a config dict for hyperparameters and should initialize
            horovod via horovod.init.
        use_gpu (bool): Whether to allocate a GPU per worker.
        num_cpus_per_slot (int): Number of CPUs to request
            from Ray per worker.
        num_hosts (int): Number of hosts that each trial is
            expected to use.
        num_slots (int): Number of slots (workers) to start on each host.
        timeout_s (int): Seconds for Horovod rendezvous to timeout.
        replicate_pem (bool): THIS MAY BE INSECURE. If true, this will
            replicate the underlying Ray cluster ssh key across all hosts.
            This may be useful if using the Ray Autoscaler.

    Returns:
        Trainable class that can be passed into `tune.run`.

    Example:

    .. code-block:: python

        def train(config):
            horovod.init()
            horovod.allreduce()

        from ray.tune.integration.horovod import DistributedTrainableCreator
        trainable_cls = DistributedTrainableCreator(
            train, num_hosts=1, num_slots=2, use_gpu=True)

        tune.run(trainable_cls)

    .. versionadded:: 1.0.0
    """
    ssh_identity_file = None
    sshkeystr = None
    if replicate_pem:
        from ray.tune.cluster_info import get_ssh_key
        ssh_identity_file = get_ssh_key()
        if os.path.exists(ssh_identity_file):
            # For now, we assume that you're on a Ray cluster.
            with open(ssh_identity_file) as f:
                sshkeystr = f.read()

    class WrappedHorovodTrainable(_HorovodTrainable):
        _function = func
        _num_hosts = num_hosts
        _num_slots = num_slots
        _num_cpus_per_slot = num_cpus_per_slot
        _use_gpu = use_gpu
        _ssh_identity_file = ssh_identity_file
        _ssh_str = sshkeystr
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict):
            extra_gpu = int(num_hosts * num_slots) * int(use_gpu)
            extra_cpu = int(num_hosts * num_slots * num_cpus_per_slot)
            return Resources(
                cpu=0,
                gpu=0,
                extra_cpu=extra_cpu,
                extra_gpu=extra_gpu,
            )

    return WrappedHorovodTrainable
def DistributedTrainableCreator(
    func: Callable[[Dict], None],
    use_gpu: bool = False,
    num_hosts: Optional[int] = None,
    num_workers: int = 1,
    num_cpus_per_worker: int = 1,
    timeout_s: int = 30,
    replicate_pem: bool = False,
) -> Type[_HorovodTrainable]:
    """Converts Horovod functions to be executable by Tune.

    Requires horovod > 0.19 to work.

    This function wraps and sets the resources for a given Horovod
    function to be used with Tune. It generates a Horovod Trainable (trial)
    which can itself be a distributed training job.

    One basic assumption of this implementation is that all sub-workers of
    a trial will be placed evenly across different machines.

    It is recommended that if `num_hosts` per trial > 1, you set
    num_workers == the size (or number of GPUs) of a single host.
    If num_hosts == 1, then you can set num_workers to be <=
    the size (number of GPUs) of a single host.

    This above assumption can be relaxed - please file a feature request
    on Github to inform the maintainers.

    Another assumption is that this API requires gloo as the underlying
    communication primitive. You will need to install Horovod with
    `HOROVOD_WITH_GLOO` enabled.

    *Fault Tolerance:* The trial workers themselves are not fault tolerant.
    When a host of a trial fails, all workers of a trial are expected to
    die, and the trial is expected to restart. This currently does not
    support function checkpointing.

    Args:
        func: A training function that takes in a config dict for
            hyperparameters and should initialize horovod via
            horovod.init.
        use_gpu: Whether to allocate a GPU per worker.
        num_cpus_per_worker: Number of CPUs to request
            from Ray per worker.
        num_hosts: Number of hosts that each trial is
            expected to use.
        num_workers: Number of workers to start on each host.
        timeout_s: Seconds for Horovod rendezvous to timeout.
        replicate_pem: THIS MAY BE INSECURE. If true, this will
            replicate the underlying Ray cluster ssh key across all hosts.
            This may be useful if using the Ray Autoscaler.

    Returns:
        Trainable class that can be passed into `tune.run`.

    Example:

    .. code-block:: python

        def train(config):
            horovod.init()
            horovod.allreduce()

        from ray.tune.integration.horovod import DistributedTrainableCreator
        trainable_cls = DistributedTrainableCreator(
            train, num_hosts=1, num_workers=2, use_gpu=True)

        tune.run(trainable_cls)

    .. versionadded:: 1.0.0
    """
    warnings.warn(
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API.",
        PendingDeprecationWarning,
        stacklevel=2,
    )
    ssh_identity_file = None
    sshkeystr = None
    if replicate_pem:
        from ray.tune.cluster_info import get_ssh_key

        ssh_identity_file = get_ssh_key()
        if os.path.exists(ssh_identity_file):
            # For now, we assume that you're on a Ray cluster.
            with open(ssh_identity_file) as f:
                sshkeystr = f.read()

    class WrappedHorovodTrainable(_HorovodTrainable):
        _function = func
        _num_hosts = num_hosts
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _use_gpu = use_gpu
        _ssh_identity_file = ssh_identity_file
        _ssh_str = sshkeystr
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict):
            return PlacementGroupFactory(
                [{}]
                + [{"CPU": cls._num_cpus_per_worker, "GPU": int(use_gpu)}]
                * num_workers
            )

    return WrappedHorovodTrainable
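# Illustration only, with assumed argument values: for num_workers=2,
# num_cpus_per_worker=4, and use_gpu=True, the `default_resource_request`
# above resolves to one empty bundle for the coordinating trainable plus
# one bundle per Horovod worker:
#
#   PlacementGroupFactory([{}, {"CPU": 4, "GPU": 1}, {"CPU": 4, "GPU": 1}])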