Example #1
File: ray.py Project: cxz/ludwig
    def __init__(self, horovod_kwargs, trainer_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RayRemoteTrainer,
                            executable_kwargs=trainer_kwargs)
Example #2
    def setup(self, config: Dict):
        trainable = wrap_function(self.__class__._function)
        # We use a filelock here to ensure that the file-writing
        # process is safe across different trainables.
        if self._ssh_identity_file:
            with FileLock(self._ssh_identity_file + ".lock"):
                settings = RayExecutor.create_settings(self._timeout_s,
                                                       self._ssh_identity_file,
                                                       self._ssh_str)
        else:
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)

        self.executor = RayExecutor(settings,
                                    cpus_per_slot=self._num_cpus_per_slot,
                                    use_gpu=self._use_gpu,
                                    num_hosts=self._num_hosts,
                                    num_slots=self._num_slots)

        # We can't put `self` in the lambda closure, so we
        # resolve the variable ahead of time.
        logdir_ = str(self.logdir)

        # Starts the workers as specified by the resources above.
        self.executor.start(
            executable_cls=trainable,
            executable_kwargs={
                "config": config,
                "logger_creator": lambda cfg: logger_creator(cfg, logdir_)
            })
Example #3
    def setup(self, model: LightningModule):
        """Creates the RayExecutor object."""
        self._model = model
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(settings,
                                    num_hosts=self.num_hosts,
                                    num_slots=self.num_slots,
                                    use_gpu=self.use_gpu)
        self.executor.start(executable_cls=get_executable_cls())
Example #4
    def setup(self, model):
        self.trainer.use_horovod = True
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(settings,
                                    num_hosts=self.num_hosts,
                                    num_slots=self.num_slots,
                                    use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())
Example #5
    def __init__(self, horovod_kwargs, predictor_kwargs):
        # TODO ray: investigate using Dask for prediction instead of Horovod
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=RemotePredictor,
                            executable_kwargs=predictor_kwargs)
Example #6
class RayLegacyTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, executable_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)

        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=RemoteTrainer,
                            executable_kwargs=executable_kwargs)

    def train(self,
              model,
              training_set,
              validation_set=None,
              test_set=None,
              **kwargs):
        workers = self.executor.driver.workers
        train_shards = training_set.pipeline().split(n=len(workers),
                                                     locality_hints=workers,
                                                     equal=True)
        val_shards = (validation_set.pipeline(
            shuffle=False).split(n=len(workers), locality_hints=workers)
                      if validation_set else None)
        test_shards = (test_set.pipeline(shuffle=False).split(
            n=len(workers), locality_hints=workers) if test_set else None)

        results = self.executor.execute(lambda trainer: legacy_train_fn(
            trainer,
            model,
            training_set.training_set_metadata,
            training_set.features,
            train_shards,
            val_shards,
            test_shards,
            **kwargs,
        ))

        return results

    def train_online(self, model, *args, **kwargs):
        results = self.executor.execute(
            lambda trainer: trainer.train_online(model, *args, **kwargs))

        return results[0]

    @property
    def validation_field(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
Example #7
    def __init__(self, horovod_kwargs, executable_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        if RayExecutor is None:
            logger.error(
                "RayLegacyTrainer failed to initialize: RayExecutor is None. Make sure horovod[ray] is installed."
            )
            return
        setting = RayExecutor.create_settings(timeout_s=30)

        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=HorovodRemoteTrainer,
                            executable_kwargs=executable_kwargs)
Example #8
class RayPredictor(BasePredictor):
    def __init__(self, horovod_kwargs, predictor_kwargs):
        # TODO ray: investigate using Dask for prediction instead of Horovod
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemotePredictor, executable_kwargs=predictor_kwargs)

    def batch_predict(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_predict(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_evaluation(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_evaluation(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_collect_activations(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        return self.executor.execute_single(
            lambda predictor: predictor.batch_collect_activations(model.load(), *args, **kwargs)
        )

    def shutdown(self):
        self.executor.shutdown()
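A hedged usage sketch for the RayPredictor class above; the constructor arguments, `model`, and `dataset` are placeholders for objects the surrounding project would normally supply, not part of the original example.

# Hypothetical caller of RayPredictor. horovod_kwargs, predictor_kwargs,
# model and dataset stand in for objects built elsewhere in the project.
predictor = RayPredictor(horovod_kwargs={}, predictor_kwargs={})
try:
    predictions = predictor.batch_predict(model, dataset)
finally:
    predictor.shutdown()  # always release the Horovod-on-Ray actors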
Example #9
    def start_executor(self):
        # Ray executor settings
        setting = RayExecutor.create_settings(timeout_s=100)
        num_hosts = 1  # number of machines to use
        num_slots = self.num_slots  # number of workers to use on each machine
        cpus_per_slot = 1  # number of cores to allocate to each worker
        gpus_per_slot = 1  # number of GPUs to allocate to each worker
        use_gpu = gpus_per_slot > 0

        # Start num_hosts * num_slots actors on the cluster
        # https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
        executor = RayExecutor(
            setting,
            num_hosts=num_hosts,
            num_slots=num_slots,
            cpus_per_slot=cpus_per_slot,
            gpus_per_slot=gpus_per_slot,
            use_gpu=use_gpu
        )

        # Launch the Ray actors on each machine
        # This will launch `num_slots` actors on each machine
        executor.start()
        return executor
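The executor returned by `start_executor` is already started, so a caller only needs to run a function on the workers and shut the actors down afterwards. A minimal sketch, assuming a placeholder `train_fn`:

# Hypothetical follow-up inside the same class as start_executor();
# train_fn is a placeholder Horovod training function.
def train_fn():
    import horovod.torch as hvd
    hvd.init()
    return hvd.rank()

executor = self.start_executor()  # the executor is already started
ranks = executor.run(train_fn)    # executes train_fn on every worker
executor.shutdown()               # releases the Ray actors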
Example #10
def main(num_workers,
         use_gpu,
         timeout_s=30,
         placement_group_timeout_s=100,
         kwargs=None):
    kwargs = kwargs or {}
    if use_gpu:
        kwargs["use_cuda"] = True
    settings = RayExecutor.create_settings(
        timeout_s=timeout_s,
        placement_group_timeout_s=placement_group_timeout_s)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
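For context, a hedged sketch of how a `main` like the one above might be invoked; the `ray.init()` call and the argument values are illustrative assumptions, not part of the original snippet.

# Hypothetical entry point for main(); the worker count, GPU flag and
# batch_size value are illustrative only.
if __name__ == "__main__":
    import ray
    ray.init()  # or ray.init(address="auto") to join an existing cluster
    main(num_workers=2, use_gpu=False, kwargs={"batch_size": 64})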
Example #11
class RayTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, trainer_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=RayRemoteTrainer,
                            executable_kwargs=trainer_kwargs)

    def train(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(lambda trainer: trainer.train(
            remote_model.load(), *args, **kwargs))

        weights, *stats = results[0]
        model.set_weights(weights)
        return (model, *stats)

    def train_online(self, model, *args, **kwargs):
        remote_model = RayRemoteModel(model)
        results = self.executor.execute(lambda trainer: trainer.train_online(
            remote_model.load(), *args, **kwargs))

        weights = results[0]
        model.set_weights(weights)
        return model

    @property
    def validation_field(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(
            lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()
Example #12
class HorovodRayAccelerator(HorovodAccelerator):
    """Pytorch Lightning Accelerator for Horovod training on a Ray cluster.

    This accelerator is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number of
    Ray actors are launched in the cluster and are configured as part of the
    Horovod ring. The Pytorch Lightning trainer is instantiated on the
    driver and sent to each of these training workers where training is
    executed. The distributed training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and 1 GPU if
    ``use_gpu`` is set to ``True``.

    If using this accelerator, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch Lightning
            Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as ptl
            from ray.util.lightning_accelerators import HorovodRayAccelerator

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4,
                use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = ptl.Trainer(..., gpus=1, accelerator=accelerator)
            trainer.fit(ptl_model)

    """
    def __init__(self,
                 *args,
                 num_hosts=1,
                 num_slots=1,
                 use_gpu=False,
                 **kwargs):
        super().__init__(*args, trainer=None, **kwargs)
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu

    def setup(self, model):
        self.trainer.use_horovod = True
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(settings,
                                    num_hosts=self.num_hosts,
                                    num_slots=self.num_slots,
                                    use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())

    def train(self):
        trainer = self.trainer
        trainer_ref = ray.put(self.trainer)
        self.trainer = None
        results = self.executor.run(self.train_remote, args=[trainer_ref])
        results, state_dict, best_path = results[0]

        self.trainer = trainer
        self.trainer.model.load_state_dict(state_dict)
        if self.trainer.checkpoint_callback:
            self.trainer.checkpoint_callback.best_model_path = best_path

        return results

    def train_remote(self, trainer_ref):
        self.trainer = ray.get(trainer_ref)
        hvd.init()
        if self.trainer.on_gpu:
            # Horovod assigns one local GPU per process.
            self.trainer.root_gpu = hvd.local_rank()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayAccelerator, self).setup(self.trainer.model)
        results = super(HorovodRayAccelerator, self).train()
        if hvd.rank() != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        model = self.trainer.model
        return results, model.state_dict(), best_model_path

    def teardown(self):
        self.executor.shutdown()
Example #13
class _HorovodTrainable(tune.Trainable):
    """Abstract Trainable class for Horovod."""
    # Callable function for training.
    _function = None
    # Number of hosts (nodes) to allocate per trial
    _num_hosts: int = 1
    # Number of workers (slots) to place on each host.
    _num_slots: int = 1
    # Number of CPU resources to reserve for each worker.
    _num_cpus_per_slot: int = 1
    # Whether to reserve and pass GPU resources through.
    _use_gpu: bool = False
    # Whether the function has completed training.
    _finished: bool = False

    # Horovod settings
    _ssh_str: str = None
    _ssh_identity_file: str = None
    _timeout_s: int = 30

    @property
    def num_workers(self):
        return self._num_hosts * self._num_slots

    def setup(self, config: Dict):
        trainable = wrap_function(self.__class__._function)
        # We use a filelock here to ensure that the file-writing
        # process is safe across different trainables.
        if self._ssh_identity_file:
            with FileLock(self._ssh_identity_file + ".lock"):
                settings = RayExecutor.create_settings(self._timeout_s,
                                                       self._ssh_identity_file,
                                                       self._ssh_str)
        else:
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)

        self.executor = RayExecutor(settings,
                                    cpus_per_slot=self._num_cpus_per_slot,
                                    use_gpu=self._use_gpu,
                                    num_hosts=self._num_hosts,
                                    num_slots=self._num_slots)

        # We can't put `self` in the lambda closure, so we
        # resolve the variable ahead of time.
        logdir_ = str(self.logdir)

        # Starts the workers as specified by the resources above.
        self.executor.start(
            executable_cls=trainable,
            executable_kwargs={
                "config": config,
                "logger_creator": lambda cfg: logger_creator(cfg, logdir_)
            })

    def step(self) -> Dict:
        if self._finished:
            raise RuntimeError("Training has already finished.")
        result = self.executor.execute(lambda w: w.step())[0]
        if RESULT_DUPLICATE in result:
            self._finished = True
        return result

    def save_checkpoint(self, checkpoint_dir: str) -> str:
        # TODO: optimize if colocated
        save_obj = self.executor.execute_single(lambda w: w.save_to_object())
        checkpoint_path = TrainableUtil.create_from_pickle(
            save_obj, checkpoint_dir)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_dir: str):
        checkpoint_obj = TrainableUtil.checkpoint_to_object(checkpoint_dir)
        x_id = ray.put(checkpoint_obj)
        return self.executor.execute(lambda w: w.restore_from_object(x_id))

    def stop(self):
        self.executor.execute(lambda w: w.stop())
        self.executor.shutdown()
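A minimal sketch of how a concrete subclass of `_HorovodTrainable` could be wired up; the training function, the attribute values, and the commented-out Tune call are illustrative assumptions rather than code from the original project.

# Hypothetical concrete trainable; train_mnist is a placeholder function
# that every Horovod worker would execute via wrap_function above.
def train_mnist(config):
    import horovod.torch as hvd
    hvd.init()
    # ... build the model, train, and report metrics back to Tune ...

class MyHorovodTrainable(_HorovodTrainable):
    _function = train_mnist
    _num_hosts = 1
    _num_slots = 2
    _num_cpus_per_slot = 1
    _use_gpu = False

# Each Tune trial would then call the setup()/step() methods defined above, e.g.:
# analysis = tune.run(MyHorovodTrainable, config={"lr": 0.1})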
Example #14
        for batch_idx, data in enumerate(train_loader):
            feature = data[:-1]
            target = data[-1]
            optimizer.zero_grad()
            output = model(*feature)
            loss = F.smooth_l1_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} \tLoss: {:.6f}'.format(
                    epoch, loss.item()))

    for epoch in range(1, args.epochs + 1):
        train(epoch)


if __name__ == '__main__':
    # connect to ray cluster
    import ray
    # ray.init(address='auto')
    ray.init()
    torch_ds, num_features = process_data()
    # Start horovod workers on Ray
    from horovod.ray import RayExecutor
    settings = RayExecutor.create_settings(500)
    executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1)
    executor.start()
    executor.run(train_fn, args=[torch_ds, num_features])
    raydp.stop_spark()
    ray.shutdown()
Example #15
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            './checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks, epochs=num_epochs, verbose=verbose)


ray.init()
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=2, use_gpu=False)
executor.start()
executor.run(train, kwargs=dict(num_epochs=1))
executor.shutdown()
Example #16
import socket
import ray

import horovod.tensorflow.keras as hvd
# import horovod.tensorflow as hvd
from horovod.ray import RayExecutor

# Start the Ray cluster or attach to an existing Ray cluster
ray.init(address="auto")

# Ray executor settings
setting = RayExecutor.create_settings(timeout_s=100)
num_hosts = 1  # number of machines to use
num_slots = 6  # number of workers to use on each machine
cpus_per_slot = 1  # number of cores to allocate to each worker
gpus_per_slot = 1  # number of GPUs to allocate to each worker

# Start num_hosts * num_slots actors on the cluster
# https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
executor = RayExecutor(setting,
                       num_hosts=num_hosts,
                       num_slots=num_slots,
                       cpus_per_slot=cpus_per_slot,
                       gpus_per_slot=gpus_per_slot,
                       use_gpu=True)

# Launch the Ray actors on each machine
# This will launch `num_slots` actors on each machine
print("Start executor...", end="", flush=True)
executor.start()
print("OK", flush=True)
Example #17
class HorovodRayPlugin(HorovodPlugin):
    """Pytorch Lightning Plugin for Horovod training on a Ray cluster.

    This plugin is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number of
    Ray actors are launched in the cluster and are configured as part of the
    Horovod ring. The Pytorch Lightning trainer is instantiated on the
    driver and sent to each of these training workers where training is
    executed. The distributed training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and 1 GPU if
    ``use_gpu`` is set to ``True``.

    If using this plugin, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch Lightning
            Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as ptl
            from ray_lightning import HorovodRayPlugin

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            plugin = HorovodRayPlugin(num_hosts=2, num_slots=4,
                use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = ptl.Trainer(..., gpus=1, plugins=[plugin])
            trainer.fit(ptl_model)

    """

    def __init__(self,
                 num_hosts: int = 1,
                 num_slots: int = 1,
                 use_gpu: bool = False):
        if not HOROVOD_AVAILABLE:
            raise RuntimeError("Please intall Horovod to use this plugin.")
        if not ray.is_initialized():
            ray.init()
        super().__init__()
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu
        self.executor = None

    def __getstate__(self):
        d = self.__dict__.copy()
        del d["executor"]
        return d

    def __setstate__(self, d):
        d["executor"] = None
        self.__dict__.update(d)

    def setup(self, model: LightningModule):
        """Creates the RayExecutor object."""
        self._model = model
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            settings,
            num_hosts=self.num_hosts,
            num_slots=self.num_slots,
            use_gpu=self.use_gpu)
        self.executor.start(executable_cls=get_executable_cls())

    def pre_dispatch(self):
        """All pre-dispatch logic should be done in train_remote instead."""
        pass

    def start_training(self, trainer):
        """Main training loop.

        Trigger remote training via ``train_remote`` on each
        worker. If using with Ray Tune, create a communication queue to
        retrieve intermediate results, and process those results. Finally
        retrieve the training results from the rank 0 worker and return."""
        model = self._model
        model_ref = ray.put(model)
        # Don't pickle the model when training remotely.
        self._model = None

        queue = None
        if TUNE_INSTALLED and is_session_enabled():
            # Create communication queue and send to all the workers.
            queue = Queue(actor_options={"num_cpus": 0})

        result_futures = self.executor.run_remote(
            self.train_remote, args=[model_ref, queue])

        results = process_results(result_futures, queue)

        results, state_dict, best_path = results[0]
        self._results = results
        self._model = model
        self._model.load_state_dict(state_dict)
        self._model.trainer.accelerator.training_type_plugin = self
        if self.lightning_module.trainer.checkpoint_callback:
            self.lightning_module.trainer.checkpoint_callback \
                .best_model_path = best_path

        if queue:
            # Shutdown the queue.
            queue.shutdown()

        return results

    def train_remote(self, model: ObjectRef, queue: Queue = None, **kwargs):
        """Training function to be executed on each remote worker."""
        self._model = ray.get(model)
        self.lightning_module.trainer.accelerator_connector\
            ._training_type_plugin = self
        self.lightning_module.trainer.accelerator.training_type_plugin = self

        hvd.init()
        self.global_rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.world_size = hvd.size()
        rank_zero_only.rank = self.global_rank

        if queue is not None:
            # Initialize session.
            init_session(rank=self.global_rank, queue=queue)

        # Move the model to the appropriate device.
        super(HorovodRayPlugin, self).model_to_device()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayPlugin, self).pre_dispatch()
        results = super(HorovodRayPlugin,
                        self).start_training(self.lightning_module.trainer)
        if self.global_rank != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.lightning_module.trainer.checkpoint_callback is not None:
            best_model_path = \
                self.lightning_module.trainer.checkpoint_callback\
                    .best_model_path

        return results, self.lightning_module.state_dict(), best_model_path

    def post_dispatch(self):
        """Shuts down the RayExecutor."""
        self.executor.shutdown()

    @property
    def is_distributed(self):
        return True

    @property
    def root_device(self):
        if self.use_gpu and torch.cuda.is_available():
            return torch.device("cuda", hvd.local_rank())
        else:
            return torch.device("cpu")
Example #18
    start = time.time()

    splits = create_dataset(files,
                            num_workers=args.num_workers,
                            epochs=args.epochs,
                            num_windows=args.num_windows)

    if args.debug:
        tasks = [
            consume.options(num_gpus=1).remote(split,
                                               rank=idx,
                                               batch_size=args.batch_size)
            for idx, split in enumerate(splits)
        ]
        ray.get(tasks)
    else:
        print("Create Ray executor")
        settings = RayExecutor.create_settings(timeout_s=30)
        executor = RayExecutor(settings,
                               num_workers=args.num_workers,
                               use_gpu=True)
        executor.start()
        executor.run(train_main, args=[args, splits])
        executor.shutdown()

    delta = time.time() - start
    print(f"success! total time {delta}")
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps({"ingest_time": delta, "success": 1}))
Example #19
from horovod.ray import RayExecutor
import horovod.torch as hvd
import ray

# Start the Ray cluster or attach to an existing Ray cluster
ray.init(address='auto')

num_workers = 4

# Start num_hosts * num_slots actors on the cluster
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_workers=num_workers, use_gpu=True)

# Launch the Ray actors on each machine
# This will launch `num_slots` actors on each machine
executor.start()


# Using the stateless `run` method, a function can take in any args or kwargs
def simple_fn():
    hvd.init()
    print("hvd rank", hvd.rank(), "hvd local rank", hvd.local_rank())
    return hvd.rank()


# Execute the function on all workers at once
result = executor.run(simple_fn)
# Check that the rank of all workers is unique
assert len(set(result)) == num_workers

executor.shutdown()
Example #20
    ray.init(address=args.address)

    num_rows = args.num_rows
    num_files = args.num_files
    num_row_groups_per_file = args.num_row_groups_per_file
    max_row_group_skew = args.max_row_group_skew
    data_dir = args.data_dir
    print(f"Generating {num_rows} rows over {num_files} files, with "
          f"{num_row_groups_per_file} row groups per file and at most "
          f"{100 * max_row_group_skew:.1f}% row group skew.")
    filenames, num_bytes = generate_data(num_rows, num_files,
                                         num_row_groups_per_file,
                                         max_row_group_skew, data_dir)
    print(f"Generated {len(filenames)} files containing {num_rows} rows "
          f"with {num_row_groups_per_file} row groups per file, totalling "
          f"{human_readable_size(num_bytes)}.")

    print("Create Ray executor")
    num_workers = args.num_workers
    cpus_per_worker = args.cpus_per_worker
    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(settings,
                           num_workers=num_workers,
                           use_gpu=True,
                           cpus_per_worker=cpus_per_worker)
    executor.start()
    executor.run(train_main, args=[args, filenames])
    executor.shutdown()

    print("Done consuming batches.")
Example #21
def main(num_workers, use_gpu, **kwargs):
    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()  # workers must be started before run()
    executor.run(train_fn, kwargs=kwargs)