def setup(self, config: Dict):
    trainable = wrap_function(self.__class__._function)
    # We use a filelock here to ensure that the file-writing
    # process is safe across different trainables.
    if self._ssh_identity_file:
        with FileLock(self._ssh_identity_file + ".lock"):
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)
    else:
        settings = RayExecutor.create_settings(self._timeout_s,
                                               self._ssh_identity_file,
                                               self._ssh_str)

    self.executor = RayExecutor(settings,
                                cpus_per_slot=self._num_cpus_per_slot,
                                use_gpu=self._use_gpu,
                                num_hosts=self._num_hosts,
                                num_slots=self._num_slots)

    # We can't put `self` in the lambda closure, so we
    # resolve the variable ahead of time.
    logdir_ = str(self.logdir)

    # Start the workers as specified by the resources above.
    self.executor.start(executable_cls=trainable,
                        executable_kwargs={
                            "config": config,
                            "logger_creator": lambda cfg: logger_creator(cfg, logdir_)
                        })
def __init__(self, horovod_kwargs, trainer_kwargs):
    # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
    setting = RayExecutor.create_settings(timeout_s=30)
    self.executor = RayExecutor(
        setting, **{**get_horovod_kwargs(), **horovod_kwargs})
    self.executor.start(executable_cls=RayRemoteTrainer,
                        executable_kwargs=trainer_kwargs)
def setup(self, model: LightningModule):
    """Creates the RayExecutor object."""
    self._model = model
    settings = RayExecutor.create_settings(timeout_s=30)
    self.executor = RayExecutor(settings,
                                num_hosts=self.num_hosts,
                                num_slots=self.num_slots,
                                use_gpu=self.use_gpu)
    self.executor.start(executable_cls=get_executable_cls())
def setup(self, model):
    self.trainer.use_horovod = True
    settings = RayExecutor.create_settings(timeout_s=30)
    self.executor = RayExecutor(settings,
                                num_hosts=self.num_hosts,
                                num_slots=self.num_slots,
                                use_gpu=self.use_gpu)
    self.trainer.model = model
    self.executor.start(executable_cls=get_executable_cls())
def __init__(self, horovod_kwargs, predictor_kwargs):
    # TODO ray: investigate using Dask for prediction instead of Horovod
    setting = RayExecutor.create_settings(timeout_s=30)
    self.executor = RayExecutor(
        setting, **{**get_horovod_kwargs(), **horovod_kwargs})
    self.executor.start(executable_cls=RemotePredictor,
                        executable_kwargs=predictor_kwargs)
def main(num_workers, use_gpu, timeout_s=30,
         placement_group_timeout_s=100, kwargs=None):
    kwargs = kwargs or {}
    if use_gpu:
        kwargs["use_cuda"] = True
    settings = RayExecutor.create_settings(
        timeout_s=timeout_s,
        placement_group_timeout_s=placement_group_timeout_s)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
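A minimal sketch of a call site for this entry point, assuming `train_fn` is defined elsewhere and a Ray cluster is reachable; the argument values here are illustrative, not part of the original:

import ray

if __name__ == "__main__":
    ray.init()  # or ray.init(address="auto") to attach to an existing cluster
    # `kwargs` is forwarded to train_fn on every worker.
    main(num_workers=2, use_gpu=False, kwargs={"num_epochs": 1})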
def __init__(self, horovod_kwargs, executable_kwargs):
    # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
    if RayExecutor is None:
        logger.error(
            "RayLegacyTrainer failed to initialize: RayExecutor is None. "
            "Make sure horovod[ray] is installed."
        )
        return
    setting = RayExecutor.create_settings(timeout_s=30)
    self.executor = RayExecutor(
        setting, **{**get_horovod_kwargs(), **horovod_kwargs})
    self.executor.start(executable_cls=HorovodRemoteTrainer,
                        executable_kwargs=executable_kwargs)
def start_executor(self):
    # Ray executor settings
    setting = RayExecutor.create_settings(timeout_s=100)
    num_hosts = 1               # number of machines to use
    num_slots = self.num_slots  # number of workers to use on each machine
    cpus_per_slot = 1           # number of CPU cores to allocate to each worker
    gpus_per_slot = 1           # number of GPUs to allocate to each worker
    use_gpu = gpus_per_slot > 0

    # Start num_hosts * num_slots actors on the cluster
    # https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
    executor = RayExecutor(
        setting,
        num_hosts=num_hosts,
        num_slots=num_slots,
        cpus_per_slot=cpus_per_slot,
        gpus_per_slot=gpus_per_slot,
        use_gpu=use_gpu
    )

    # Launch the Ray actors on each machine.
    # This will launch `num_slots` actors on each machine.
    executor.start()
    return executor
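A hedged sketch of how the returned executor might be consumed; `trainer` and `train_fn` are hypothetical names, not part of the snippet above:

executor = trainer.start_executor()  # hypothetical instance exposing the method above
results = executor.run(train_fn)     # runs train_fn once on each of the num_slots workers
executor.shutdown()                  # tear down the Ray actors when training is done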
def train(epoch):
    for batch_idx, data in enumerate(train_loader):
        feature = data[:-1]
        target = data[-1]
        optimizer.zero_grad()
        output = model(*feature)
        loss = F.smooth_l1_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(
                epoch, loss.item()))

for epoch in range(1, args.epochs + 1):
    train(epoch)


if __name__ == '__main__':
    # Connect to the Ray cluster.
    import ray
    # ray.init(address='auto')
    ray.init()

    torch_ds, num_features = process_data()

    # Start the Horovod workers on Ray.
    from horovod.ray import RayExecutor
    settings = RayExecutor.create_settings(timeout_s=500)
    executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1)
    executor.start()
    executor.run(train_fn, args=[torch_ds, num_features])

    raydp.stop_spark()
    ray.shutdown()
callbacks = [
    # Horovod: average metrics among workers at the end of every epoch.
    # Note: this callback must be in the list before the ReduceLROnPlateau,
    # TensorBoard or other metrics-based callbacks.
    hvd.callbacks.MetricAverageCallback(),

    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
    # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
    hvd.callbacks.LearningRateWarmupCallback(
        warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

# Horovod: write logs on worker 0.
verbose = 1 if hvd.rank() == 0 else 0

# Train the model.
# Horovod: adjust number of steps based on number of GPUs.
mnist_model.fit(dataset,
                steps_per_epoch=500 // hvd.size(),
                callbacks=callbacks,
                epochs=num_epochs,
                verbose=verbose)


ray.init()
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=2, use_gpu=False)
executor.start()
executor.run(train, kwargs=dict(num_epochs=1))
executor.shutdown()
import socket

import ray
import horovod.tensorflow.keras as hvd
# import horovod.tensorflow as hvd
from horovod.ray import RayExecutor

# Start the Ray cluster or attach to an existing Ray cluster
ray.init(address="auto")

# Ray executor settings
setting = RayExecutor.create_settings(timeout_s=100)
num_hosts = 1      # number of machines to use
num_slots = 6      # number of workers to use on each machine
cpus_per_slot = 1  # number of CPU cores to allocate to each worker
gpus_per_slot = 1  # number of GPUs to allocate to each worker

# Start num_hosts * num_slots actors on the cluster
# https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
executor = RayExecutor(setting,
                       num_hosts=num_hosts,
                       num_slots=num_slots,
                       cpus_per_slot=cpus_per_slot,
                       gpus_per_slot=gpus_per_slot,
                       use_gpu=True)

# Launch the Ray actors on each machine.
# This will launch `num_slots` actors on each machine.
print("Start executor...", end="", flush=True)
executor.start()
print("OK", flush=True)
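As a hedged follow-up (not in the original snippet), one way to verify where the six workers landed is to run a small function through the executor; `check_host` is a hypothetical helper, and `executor.run` returns one result per worker:

def check_host():
    # Each worker initializes Horovod, then reports its rank and host.
    hvd.init()
    return f"rank {hvd.rank()} on host {socket.gethostname()}"

for line in executor.run(check_host):
    print(line)

executor.shutdown()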
def main(num_workers, use_gpu, **kwargs):
    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    # The worker actors must be started before any work can be submitted.
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
    executor.shutdown()