class RayPredictor(BasePredictor):
    def __init__(self, horovod_kwargs, predictor_kwargs):
        # TODO ray: investigate using Dask for prediction instead of Horovod
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemotePredictor, executable_kwargs=predictor_kwargs)

    def batch_predict(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_predict(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_evaluation(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_evaluation(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_collect_activations(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        return self.executor.execute_single(
            lambda predictor: predictor.batch_collect_activations(model.load(), *args, **kwargs)
        )

    def shutdown(self):
        self.executor.shutdown()
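
# Hedged usage sketch (not part of the original source): how RayPredictor might be
# driven from the driver process. ``model`` and ``dataset`` are hypothetical
# placeholders; the real positional arguments are whatever RemotePredictor.batch_predict
# expects.
def example_ray_predict(model, dataset):
    predictor = RayPredictor(horovod_kwargs={}, predictor_kwargs={})
    try:
        # Runs batch prediction on the Horovod-on-Ray workers and returns
        # the rank-0 result.
        return predictor.batch_predict(model, dataset)
    finally:
        # Always tear down the RayExecutor actors.
        predictor.shutdown()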
class RayLegacyTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, executable_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemoteTrainer, executable_kwargs=executable_kwargs)

    def train(self, model, training_set, validation_set=None, test_set=None, **kwargs):
        workers = self.executor.driver.workers
        train_shards = training_set.pipeline().split(
            n=len(workers), locality_hints=workers, equal=True
        )
        val_shards = (
            validation_set.pipeline(shuffle=False).split(n=len(workers), locality_hints=workers)
            if validation_set
            else None
        )
        test_shards = (
            test_set.pipeline(shuffle=False).split(n=len(workers), locality_hints=workers)
            if test_set
            else None
        )

        results = self.executor.execute(
            lambda trainer: legacy_train_fn(
                trainer,
                model,
                training_set.training_set_metadata,
                training_set.features,
                train_shards,
                val_shards,
                test_shards,
                **kwargs,
            )
        )
        return results

    def train_online(self, model, *args, **kwargs):
        results = self.executor.execute(
            lambda trainer: trainer.train_online(model, *args, **kwargs)
        )
        return results[0]

    @property
    def validation_field(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
def main(num_workers, use_gpu, timeout_s=30, placement_group_timeout_s=100, kwargs=None):
    kwargs = kwargs or {}
    if use_gpu:
        kwargs["use_cuda"] = True

    settings = RayExecutor.create_settings(
        timeout_s=timeout_s, placement_group_timeout_s=placement_group_timeout_s
    )
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
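
# Hedged usage sketch (not part of the original source): launching the job defined by
# ``main`` above. The ``ray.init()`` call and worker count are assumptions; ``train_fn``
# must already be defined in this module, since ``main`` passes it to ``executor.run``.
if __name__ == "__main__":
    import ray

    ray.init()  # or ray.init(address="auto") to join an existing cluster
    main(num_workers=2, use_gpu=False)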
class RayTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, trainer_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RayRemoteTrainer, executable_kwargs=trainer_kwargs)

    def train(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda trainer: trainer.train(remote_model.load(), *args, **kwargs)
        )

        weights, *stats = results[0]
        model.set_weights(weights)
        return (model, *stats)

    def train_online(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda trainer: trainer.train_online(remote_model.load(), *args, **kwargs)
        )

        weights = results[0]
        model.set_weights(weights)
        return model

    @property
    def validation_field(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
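
# Hedged usage sketch (not part of the original source): a driver-side training call
# against RayTrainer. ``model`` and ``training_set`` are hypothetical placeholders for
# whatever RayRemoteTrainer.train expects after the deserialized model.
def example_ray_train(model, training_set):
    trainer = RayTrainer(horovod_kwargs={}, trainer_kwargs={})
    try:
        # ``train`` returns the driver-side model with the rank-0 weights applied,
        # followed by the training statistics.
        model, *train_stats = trainer.train(model, training_set)
        return model, train_stats
    finally:
        trainer.shutdown()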
def start_executor(self):
    # Ray executor settings
    setting = RayExecutor.create_settings(timeout_s=100)
    num_hosts = 1                # number of machines to use
    num_slots = self.num_slots   # number of workers to use on each machine
    cpus_per_slot = 1            # number of cores to allocate to each worker
    gpus_per_slot = 1            # number of GPUs to allocate to each worker
    use_gpu = gpus_per_slot > 0

    # Start num_hosts * num_slots actors on the cluster
    # https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
    executor = RayExecutor(
        setting,
        num_hosts=num_hosts,
        num_slots=num_slots,
        cpus_per_slot=cpus_per_slot,
        gpus_per_slot=gpus_per_slot,
        use_gpu=use_gpu,
    )

    # Launch the Ray actors on each machine.
    # This will launch `num_slots` actors on each machine.
    executor.start()
    return executor
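
# Hedged usage sketch (not part of the original source): running a Horovod training
# function on the executor returned by ``start_executor``. ``train_fn`` is a
# hypothetical worker function that calls ``hvd.init()`` itself, as Horovod worker
# functions normally do; ``obj`` stands in for the instance that defines
# ``start_executor``.
def example_run(obj, train_fn):
    executor = obj.start_executor()
    try:
        # ``run`` blocks until every worker finishes and returns one result per worker.
        return executor.run(train_fn)
    finally:
        executor.shutdown()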
class _HorovodTrainable(tune.Trainable):
    """Abstract Trainable class for Horovod."""

    # Callable function for training.
    _function = None
    # Number of hosts (nodes) to allocate per trial.
    _num_hosts: int = 1
    # Number of workers (slots) to place on each host.
    _num_slots: int = 1
    # Number of CPU resources to reserve for each worker.
    _num_cpus_per_slot: int = 1
    # Whether to reserve and pass GPU resources through.
    _use_gpu: bool = False
    # Whether the function has completed training.
    _finished: bool = False

    # Horovod settings
    _ssh_str: str = None
    _ssh_identity_file: str = None
    _timeout_s: int = 30

    @property
    def num_workers(self):
        return self._num_hosts * self._num_slots

    def setup(self, config: Dict):
        trainable = wrap_function(self.__class__._function)
        # We use a filelock here to ensure that the file-writing
        # process is safe across different trainables.
        if self._ssh_identity_file:
            with FileLock(self._ssh_identity_file + ".lock"):
                settings = RayExecutor.create_settings(self._timeout_s,
                                                       self._ssh_identity_file,
                                                       self._ssh_str)
        else:
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)

        self.executor = RayExecutor(
            settings,
            cpus_per_slot=self._num_cpus_per_slot,
            use_gpu=self._use_gpu,
            num_hosts=self._num_hosts,
            num_slots=self._num_slots)

        # We can't put `self` in the lambda closure, so we
        # resolve the variable ahead of time.
        logdir_ = str(self.logdir)

        # Starts the workers as specified by the resources above.
        self.executor.start(
            executable_cls=trainable,
            executable_kwargs={
                "config": config,
                "logger_creator": lambda cfg: logger_creator(cfg, logdir_)
            })

    def step(self) -> Dict:
        if self._finished:
            raise RuntimeError("Training has already finished.")
        result = self.executor.execute(lambda w: w.step())[0]
        if RESULT_DUPLICATE in result:
            self._finished = True
        return result

    def save_checkpoint(self, checkpoint_dir: str) -> str:
        # TODO: optimize if colocated
        save_obj = self.executor.execute_single(lambda w: w.save_to_object())
        checkpoint_path = TrainableUtil.create_from_pickle(save_obj, checkpoint_dir)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_dir: str):
        checkpoint_obj = TrainableUtil.checkpoint_to_object(checkpoint_dir)
        x_id = ray.put(checkpoint_obj)
        return self.executor.execute(lambda w: w.restore_from_object(x_id))

    def stop(self):
        self.executor.execute(lambda w: w.stop())
        self.executor.shutdown()
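
# Hedged usage sketch (not part of the original source): wiring a training function
# into the abstract trainable above. In practice this subclass is usually generated
# by a helper (e.g. Horovod's DistributedTrainableCreator) rather than written by
# hand; the explicit subclass here only illustrates the class attributes that
# ``_HorovodTrainable.setup`` reads.
def example_train_fn(config):
    # Regular Tune function-API trainable; each Horovod worker runs a copy of it.
    ...

class ExampleHorovodTrainable(_HorovodTrainable):
    _function = staticmethod(example_train_fn)  # wrapped via wrap_function in setup()
    _num_hosts = 1
    _num_slots = 2          # 2 Horovod workers per trial
    _num_cpus_per_slot = 1
    _use_gpu = False

# ``tune.run(ExampleHorovodTrainable, config={...})`` would then launch trials, each of
# which starts its own RayExecutor with ``_num_hosts * _num_slots`` workers.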
    # Inside ``train_fn`` (run on each Horovod worker): per-epoch training loop.
    def train(epoch):
        for batch_idx, data in enumerate(train_loader):
            feature = data[:-1]
            target = data[-1]
            optimizer.zero_grad()
            output = model(*feature)
            loss = F.smooth_l1_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} \tLoss: {:.6f}'.format(
                    epoch, loss.item()))

    for epoch in range(1, args.epochs + 1):
        train(epoch)


if __name__ == '__main__':
    # Connect to the Ray cluster.
    import ray
    # ray.init(address='auto')
    ray.init()

    torch_ds, num_features = process_data()

    # Start Horovod workers on Ray.
    from horovod.ray import RayExecutor
    settings = RayExecutor.create_settings(500)
    executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1)
    executor.start()
    executor.run(train_fn, args=[torch_ds, num_features])

    raydp.stop_spark()
    ray.shutdown()
class HorovodRayPlugin(HorovodPlugin):
    """Pytorch Lightning Plugin for Horovod training on a Ray cluster.

    This plugin is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number
    of Ray actors are launched in the cluster and are configured as part
    of the Horovod ring. The Pytorch Lightning trainer is instantiated
    on the driver and sent to each of these training workers where
    training is executed. The distributed training protocol is handled
    by Horovod.

    Each training worker is configured to reserve 1 CPU and, if
    ``use_gpu`` is set to ``True``, 1 GPU.

    If using this plugin, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch
            Lightning Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as pl
            from ray_lightning import HorovodRayPlugin

            ptl_model = MNISTClassifier(...)

            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            plugin = HorovodRayPlugin(num_hosts=2, num_slots=4, use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
            trainer.fit(ptl_model)
    """

    def __init__(self, num_hosts: int = 1, num_slots: int = 1, use_gpu: bool = False):
        if not HOROVOD_AVAILABLE:
            raise RuntimeError("Please install Horovod to use this plugin.")
        if not ray.is_initialized():
            ray.init()
        super().__init__()
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu
        self.executor = None

    def __getstate__(self):
        d = self.__dict__.copy()
        del d["executor"]
        return d

    def __setstate__(self, d):
        d["executor"] = None
        self.__dict__.update(d)

    def setup(self, model: LightningModule):
        """Creates the RayExecutor object."""
        self._model = model
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            settings,
            num_hosts=self.num_hosts,
            num_slots=self.num_slots,
            use_gpu=self.use_gpu)
        self.executor.start(executable_cls=get_executable_cls())

    def pre_dispatch(self):
        """All pre-dispatch logic should be done in train_remote instead."""
        pass

    def start_training(self, trainer):
        """Main training loop.

        Trigger remote training via ``train_remote`` on each worker. If using
        with Ray Tune, create a communication queue to retrieve intermediate
        results and process those results. Finally, retrieve the training
        results from the rank 0 worker and return them."""
        model = self._model
        model_ref = ray.put(model)
        # Don't pickle the model when training remotely.
        self._model = None

        queue = None
        if TUNE_INSTALLED and is_session_enabled():
            # Create communication queue and send to all the workers.
            queue = Queue(actor_options={"num_cpus": 0})

        result_futures = self.executor.run_remote(
            self.train_remote, args=[model_ref, queue])
        results = process_results(result_futures, queue)

        results, state_dict, best_path = results[0]
        self._results = results

        self._model = model
        self._model.load_state_dict(state_dict)
        self._model.trainer.accelerator.training_type_plugin = self
        if self.lightning_module.trainer.checkpoint_callback:
            self.lightning_module.trainer.checkpoint_callback \
                .best_model_path = best_path

        if queue:
            # Shutdown the queue.
            queue.shutdown()

        return results

    def train_remote(self, model: ObjectRef, queue: Queue = None, **kwargs):
        """Training function to be executed on each remote worker."""
        self._model = ray.get(model)
        self.lightning_module.trainer.accelerator_connector \
            ._training_type_plugin = self
        self.lightning_module.trainer.accelerator.training_type_plugin = self

        hvd.init()
        self.global_rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.world_size = hvd.size()
        rank_zero_only.rank = self.global_rank

        if queue is not None:
            # Initialize session.
            init_session(rank=self.global_rank, queue=queue)

        # Move the model to the appropriate device.
        super(HorovodRayPlugin, self).model_to_device()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayPlugin, self).pre_dispatch()
        results = super(HorovodRayPlugin,
                        self).start_training(self.lightning_module.trainer)

        if self.global_rank != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.lightning_module.trainer.checkpoint_callback is not None:
            best_model_path = \
                self.lightning_module.trainer.checkpoint_callback \
                .best_model_path

        return results, self.lightning_module.state_dict(), best_model_path

    def post_dispatch(self):
        """Shuts down the RayExecutor."""
        self.executor.shutdown()

    @property
    def is_distributed(self):
        return True

    @property
    def root_device(self):
        if self.use_gpu and torch.cuda.is_available():
            return torch.device("cuda", hvd.local_rank())
        else:
            return torch.device("cpu")
class HorovodRayAccelerator(HorovodAccelerator):
    """Pytorch Lightning Accelerator for Horovod training on a Ray cluster.

    This accelerator is used to manage distributed training on a Ray
    cluster via the Horovod training framework. Internally, the
    specified number of Ray actors are launched in the cluster and are
    configured as part of the Horovod ring. The Pytorch Lightning
    trainer is instantiated on the driver and sent to each of these
    training workers where training is executed. The distributed
    training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and, if
    ``use_gpu`` is set to ``True``, 1 GPU.

    If using this accelerator, you should run your code like a normal
    Python script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch
            Lightning Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as pl
            from ray.util.lightning_accelerators import HorovodRayAccelerator

            ptl_model = MNISTClassifier(...)

            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4, use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = pl.Trainer(..., gpus=1, accelerator=accelerator)
            trainer.fit(ptl_model)
    """

    def __init__(self, *args, num_hosts=1, num_slots=1, use_gpu=False, **kwargs):
        super().__init__(*args, trainer=None, **kwargs)
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu

    def setup(self, model):
        self.trainer.use_horovod = True
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            settings,
            num_hosts=self.num_hosts,
            num_slots=self.num_slots,
            use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())

    def train(self):
        trainer = self.trainer
        trainer_ref = ray.put(self.trainer)
        self.trainer = None

        results = self.executor.run(self.train_remote, args=[trainer_ref])
        results, state_dict, best_path = results[0]

        self.trainer = trainer
        self.trainer.model.load_state_dict(state_dict)
        if self.trainer.checkpoint_callback:
            self.trainer.checkpoint_callback.best_model_path = best_path

        return results

    def train_remote(self, trainer_ref):
        self.trainer = ray.get(trainer_ref)
        hvd.init()
        if self.trainer.on_gpu:
            # Horovod assigns one local GPU per process.
            self.trainer.root_gpu = hvd.local_rank()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayAccelerator, self).setup(self.trainer.model)
        results = super(HorovodRayAccelerator, self).train()

        if hvd.rank() != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        model = self.trainer.model
        return results, model.state_dict(), best_model_path

    def teardown(self):
        self.executor.shutdown()