Example #1
class RayPredictor(BasePredictor):
    def __init__(self, horovod_kwargs, predictor_kwargs):
        # TODO ray: investigate using Dask for prediction instead of Horovod
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemotePredictor, executable_kwargs=predictor_kwargs)

    def batch_predict(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_predict(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_evaluation(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_evaluation(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_collect_activations(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        return self.executor.execute_single(
            lambda predictor: predictor.batch_collect_activations(model.load(), *args, **kwargs)
        )

    def shutdown(self):
        self.executor.shutdown()
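
A minimal, hypothetical driver for the RayPredictor above (not part of the original example). The horovod_kwargs override, the model, and the dataset are placeholders; RemotePredictor and get_horovod_kwargs are assumed to come from the surrounding module.

import ray

ray.init()

# Hypothetical usage sketch; model and dataset stand in for whatever the caller provides.
predictor = RayPredictor(
    horovod_kwargs={"num_workers": 2},  # assumed override, merged with get_horovod_kwargs()
    predictor_kwargs={},
)
try:
    predictions = predictor.batch_predict(model, dataset)
finally:
    predictor.shutdown()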
Example #2
class RayLegacyTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, executable_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)

        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=RemoteTrainer,
                            executable_kwargs=executable_kwargs)

    def train(self,
              model,
              training_set,
              validation_set=None,
              test_set=None,
              **kwargs):
        workers = self.executor.driver.workers
        train_shards = training_set.pipeline().split(n=len(workers),
                                                     locality_hints=workers,
                                                     equal=True)
        val_shards = (validation_set.pipeline(shuffle=False)
                      .split(n=len(workers), locality_hints=workers)
                      if validation_set else None)
        test_shards = (test_set.pipeline(shuffle=False)
                       .split(n=len(workers), locality_hints=workers)
                       if test_set else None)

        results = self.executor.execute(lambda trainer: legacy_train_fn(
            trainer,
            model,
            training_set.training_set_metadata,
            training_set.features,
            train_shards,
            val_shards,
            test_shards,
            **kwargs,
        ))

        return results

    def train_online(self, model, *args, **kwargs):
        results = self.executor.execute(
            lambda trainer: trainer.train_online(model, *args, **kwargs))

        return results[0]

    @property
    def validation_field(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
Example #3
class RayTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, trainer_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=RayRemoteTrainer,
                            executable_kwargs=trainer_kwargs)

    def train(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(lambda trainer: trainer.train(
            remote_model.load(), *args, **kwargs))

        weights, *stats = results[0]
        model.set_weights(weights)
        return (model, *stats)

    def train_online(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(lambda trainer: trainer.train_online(
            remote_model.load(), *args, **kwargs))

        weights = results[0]
        model.set_weights(weights)
        return model

    @property
    def validation_field(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
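
A hedged usage sketch for the RayTrainer above; the horovod_kwargs, trainer_kwargs, model, and training_set values are placeholders for whatever the surrounding code supplies.

import ray

ray.init()

# Hypothetical driver sketch for RayTrainer.
trainer = RayTrainer(horovod_kwargs={}, trainer_kwargs={})
try:
    # train() ships the model to the workers, runs distributed training,
    # then copies the rank-0 weights back into the local model.
    model, *train_stats = trainer.train(model, training_set)
finally:
    trainer.shutdown()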
Example #4
class _HorovodTrainable(tune.Trainable):
    """Abstract Trainable class for Horovod."""
    # Callable function for training.
    _function = None
    # Number of hosts (nodes) to allocate per trial
    _num_hosts: int = 1
    # Number of workers (slots) to place on each host.
    _num_slots: int = 1
    # Number of CPU resources to reserve for each worker.
    _num_cpus_per_slot: int = 1
    # Whether to reserve and pass GPU resources through.
    _use_gpu: bool = False
    # Whether the function has completed training.
    _finished: bool = False

    # Horovod settings
    _ssh_str: str = None
    _ssh_identity_file: str = None
    _timeout_s: int = 30

    @property
    def num_workers(self):
        return self._num_hosts * self._num_slots

    def setup(self, config: Dict):
        trainable = wrap_function(self.__class__._function)
        # We use a filelock here to ensure that the file-writing
        # process is safe across different trainables.
        if self._ssh_identity_file:
            with FileLock(self._ssh_identity_file + ".lock"):
                settings = RayExecutor.create_settings(self._timeout_s,
                                                       self._ssh_identity_file,
                                                       self._ssh_str)
        else:
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)

        self.executor = RayExecutor(settings,
                                    cpus_per_slot=self._num_cpus_per_slot,
                                    use_gpu=self._use_gpu,
                                    num_hosts=self._num_hosts,
                                    num_slots=self._num_slots)

        # We can't put `self` in the lambda closure, so we
        # resolve the variable ahead of time.
        logdir_ = str(self.logdir)

        # Starts the workers as specified by the resources above.
        self.executor.start(
            executable_cls=trainable,
            executable_kwargs={
                "config": config,
                "logger_creator": lambda cfg: logger_creator(cfg, logdir_),
            })

    def step(self) -> Dict:
        if self._finished:
            raise RuntimeError("Training has already finished.")
        result = self.executor.execute(lambda w: w.step())[0]
        if RESULT_DUPLICATE in result:
            self._finished = True
        return result

    def save_checkpoint(self, checkpoint_dir: str) -> str:
        # TODO: optimize if colocated
        save_obj = self.executor.execute_single(lambda w: w.save_to_object())
        checkpoint_path = TrainableUtil.create_from_pickle(
            save_obj, checkpoint_dir)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_dir: str):
        checkpoint_obj = TrainableUtil.checkpoint_to_object(checkpoint_dir)
        x_id = ray.put(checkpoint_obj)
        return self.executor.execute(lambda w: w.restore_from_object(x_id))

    def stop(self):
        self.executor.execute(lambda w: w.stop())
        self.executor.shutdown()
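
One way to turn the abstract Trainable above into something Tune can run is to subclass it, bind the training function and resource attributes that setup() reads, and pass the subclass to tune.run. This is only a sketch under assumptions: train_fn is a placeholder, and real code would typically use a factory helper to build the subclass rather than writing it by hand.

from ray import tune


def train_fn(config):
    # Placeholder Horovod training loop; a real function would train the
    # model and report metrics back to Tune (e.g. via tune.report()).
    pass


class MyHorovodTrainable(_HorovodTrainable):
    # Bind the class attributes that _HorovodTrainable.setup() expects.
    _function = staticmethod(train_fn)
    _num_hosts = 1
    _num_slots = 2
    _num_cpus_per_slot = 1
    _use_gpu = False


analysis = tune.run(
    MyHorovodTrainable,
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=2,
)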
Example #5
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            './checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks, epochs=num_epochs, verbose=verbose)


ray.init()
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=2, use_gpu=False)
executor.start()
executor.run(train, kwargs=dict(num_epochs=1))
executor.shutdown()
Example #6
class HorovodRayPlugin(HorovodPlugin):
    """Pytorch Lightning Plugin for Horovod training on a Ray cluster.

    This plugin is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number of
    Ray actors are launched in the cluster and are configured as part of the
    Horovod ring. The Pytorch Lightning trainer is instantiated on the
    driver and sent to each of these training workers where training is
    executed. The distributed training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and 1 GPU if
    ``use_gpu`` is set to ``True``.

    If using this plugin, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch Lightning
            Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as pl
            from ray_lightning import HorovodRayPlugin

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            plugin = HorovodRayPlugin(num_hosts=2, num_slots=4,
                use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
            trainer.fit(ptl_model)

    """

    def __init__(self,
                 num_hosts: int = 1,
                 num_slots: int = 1,
                 use_gpu: bool = False):
        if not HOROVOD_AVAILABLE:
            raise RuntimeError("Please intall Horovod to use this plugin.")
        if not ray.is_initialized():
            ray.init()
        super().__init__()
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu
        self.executor = None

    def __getstate__(self):
        d = self.__dict__.copy()
        del d["executor"]
        return d

    def __setstate__(self, d):
        d["executor"] = None
        self.__dict__.update(d)

    def setup(self, model: LightningModule):
        """Creates the RayExecutor object."""
        self._model = model
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            settings,
            num_hosts=self.num_hosts,
            num_slots=self.num_slots,
            use_gpu=self.use_gpu)
        self.executor.start(executable_cls=get_executable_cls())

    def pre_dispatch(self):
        """All pre-dispatch logic should be done in train_remote instead."""
        pass

    def start_training(self, trainer):
        """Main training loop.

        Trigger remote training via ``train_remote`` on each
        worker. If using with Ray Tune, create a communication queue to
        retrieve intermediate results, and process those results. Finally
        retrieve the training results from the rank 0 worker and return."""
        model = self._model
        model_ref = ray.put(model)
        # Don't pickle the model when training remotely.
        self._model = None

        queue = None
        if TUNE_INSTALLED and is_session_enabled():
            # Create communication queue and send to all the workers.
            queue = Queue(actor_options={"num_cpus": 0})

        result_futures = self.executor.run_remote(
            self.train_remote, args=[model_ref, queue])

        results = process_results(result_futures, queue)

        results, state_dict, best_path = results[0]
        self._results = results
        self._model = model
        self._model.load_state_dict(state_dict)
        self._model.trainer.accelerator.training_type_plugin = self
        if self.lightning_module.trainer.checkpoint_callback:
            self.lightning_module.trainer.checkpoint_callback \
                .best_model_path = best_path

        if queue:
            # Shutdown the queue.
            queue.shutdown()

        return results

    def train_remote(self, model: ObjectRef, queue: Queue = None, **kwargs):
        """Training function to be executed on each remote worker."""
        self._model = ray.get(model)
        self.lightning_module.trainer.accelerator_connector\
            ._training_type_plugin = self
        self.lightning_module.trainer.accelerator.training_type_plugin = self

        hvd.init()
        self.global_rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.world_size = hvd.size()
        rank_zero_only.rank = self.global_rank

        if queue is not None:
            # Initialize session.
            init_session(rank=self.global_rank, queue=queue)

        # Move the model to the appropriate device.
        super(HorovodRayPlugin, self).model_to_device()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayPlugin, self).pre_dispatch()
        results = super(HorovodRayPlugin,
                        self).start_training(self.lightning_module.trainer)
        if self.global_rank != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.lightning_module.trainer.checkpoint_callback is not None:
            best_model_path = \
                self.lightning_module.trainer.checkpoint_callback\
                    .best_model_path

        return results, self.lightning_module.state_dict(), best_model_path

    def post_dispatch(self):
        """Shuts down the RayExecutor."""
        self.executor.shutdown()

    @property
    def is_distributed(self):
        return True

    @property
    def root_device(self):
        if self.use_gpu and torch.cuda.is_available():
            return torch.device("cuda", hvd.local_rank())
        else:
            return torch.device("cpu")
class HorovodRayAccelerator(HorovodAccelerator):
    """Pytorch Lightning Accelerator for Horovod training on a Ray cluster.

    This accelerator is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number of
    Ray actors are launched in the cluster and are configured as part of the
    Horovod ring. The Pytorch Lightning trainer is instantiated on the
    driver and sent to each of these training workers where training is
    executed. The distributed training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and 1 GPU if
    ``use_gpu`` is set to ``True``.

    If using this accelerator, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch Lightning
            Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as pl
            from ray.util.lightning_accelerators import HorovodRayAccelerator

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4,
                use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = pl.Trainer(..., gpus=1, accelerator=accelerator)
            trainer.fit(ptl_model)

    """
    def __init__(self,
                 *args,
                 num_hosts=1,
                 num_slots=1,
                 use_gpu=False,
                 **kwargs):
        super().__init__(*args, trainer=None, **kwargs)
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu

    def setup(self, model):
        self.trainer.use_horovod = True
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(settings,
                                    num_hosts=self.num_hosts,
                                    num_slots=self.num_slots,
                                    use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())

    def train(self):
        trainer = self.trainer
        trainer_ref = ray.put(self.trainer)
        self.trainer = None
        results = self.executor.run(self.train_remote, args=[trainer_ref])
        results, state_dict, best_path = results[0]

        self.trainer = trainer
        self.trainer.model.load_state_dict(state_dict)
        if self.trainer.checkpoint_callback:
            self.trainer.checkpoint_callback.best_model_path = best_path

        return results

    def train_remote(self, trainer_ref):
        self.trainer = ray.get(trainer_ref)
        hvd.init()
        if self.trainer.on_gpu:
            # Horovod assigns one local GPU per process.
            self.trainer.root_gpu = hvd.local_rank()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayAccelerator, self).setup(self.trainer.model)
        results = super(HorovodRayAccelerator, self).train()
        if hvd.rank() != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        model = self.trainer.model
        return results, model.state_dict(), best_model_path

    def teardown(self):
        self.executor.shutdown()