def test_local(ray_start_4_cpus):
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_hosts=1, num_slots=4)
    hjob.start()
    hostnames = hjob.execute(lambda _: socket.gethostname())
    assert len(set(hostnames)) == 1, hostnames
    hjob.shutdown()
    assert check_resources(original_resources)

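# `check_resources` (used above and in later tests) and the `ray_start_*`
# fixtures are defined elsewhere in this module. The module-level imports
# assumed throughout this section are roughly:
#
#   import os
#   import socket
#   import ray
#   import torch
#   from horovod.ray import RayExecutor
#
# A minimal sketch of `check_resources`, assuming it polls until the
# cluster's available resources return to the pre-job snapshot (the real
# helper may differ):
def check_resources(original_resources, retries=20, delay=0.5):
    import time
    for _ in range(retries):
        if ray.available_resources() == original_resources:
            return True
        time.sleep(delay)
    return False
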
# Parametrized variant of test_local: num_workers, num_hosts, and
# num_workers_per_host are presumably supplied via @pytest.mark.parametrize.
def test_local_parametrized(ray_start_4_cpus, num_workers, num_hosts,
                            num_workers_per_host):
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting,
        num_workers=num_workers,
        num_hosts=num_hosts,
        num_workers_per_host=num_workers_per_host)
    hjob.start()
    hostnames = hjob.execute(lambda _: socket.gethostname())
    assert len(set(hostnames)) == 1, hostnames
    hjob.shutdown()

def test_gpu_ids_num_workers(ray_start_4_cpus_4_gpus):
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_workers=4, use_gpu=True)
    hjob.start()
    all_envs = hjob.execute(lambda _: os.environ.copy())
    all_cudas = {ev["CUDA_VISIBLE_DEVICES"] for ev in all_envs}
    # All 4 workers share one host, so each should see the same 4 GPU ids.
    assert len(all_cudas) == 1, all_cudas
    assert len(all_envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4, \
        all_envs[0]["CUDA_VISIBLE_DEVICES"]

    def _test(worker):
        import horovod.torch as hvd
        hvd.init()
        local_rank = str(hvd.local_rank())
        # Each worker's local rank should map to one of its visible GPU ids.
        return local_rank in os.environ["CUDA_VISIBLE_DEVICES"]

    all_valid_local_rank = hjob.execute(_test)
    assert all(all_valid_local_rank)
    hjob.shutdown()

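# The `ray_start_4_cpus` / `ray_start_4_cpus_4_gpus` fixtures are likewise
# defined elsewhere; a plausible sketch, assuming each starts a fresh local
# Ray instance and tears it down after the test (requires pytest):
import pytest

@pytest.fixture
def ray_start_4_cpus_4_gpus():
    address_info = ray.init(num_cpus=4, num_gpus=4)
    yield address_info
    ray.shutdown()
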
def test_gpu_ids(ray_start_4_cpus_4_gpus):
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_workers_per_host=4, use_gpu=True)
    hjob.start()
    all_envs = hjob.execute(lambda _: os.environ.copy())
    all_cudas = {ev["CUDA_VISIBLE_DEVICES"] for ev in all_envs}
    assert len(all_cudas) == 1, all_cudas
    assert len(all_envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4
    hjob.shutdown()
    assert check_resources(original_resources)

def test_horovod_train(ray_start_4_cpus):
    def simple_fn(worker):
        local_rank = _train()
        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    # With 4 slots on one host, local ranks should cover 0..3.
    assert set(result) == {0, 1, 2, 3}
    hjob.shutdown()

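# `_train` is referenced by the training tests but not defined in this
# section. A minimal sketch, assuming it performs one Horovod-wrapped
# optimization step and returns the worker's local rank (which is what
# the assertions check):
def _train():
    import torch
    import horovod.torch as hvd
    hvd.init()
    model = torch.nn.Linear(2, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    loss = model(torch.ones(4, 2)).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return hvd.local_rank()
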
def test_train(ray_start_4_cpus):
    def simple_fn(worker):
        local_rank = _train()
        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_workers=3, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert set(result) == {0, 1, 2}
    # run_remote returns object refs rather than values; resolve with ray.get.
    result = ray.get(hjob.run_remote(simple_fn, args=[None]))
    assert set(result) == {0, 1, 2}
    hjob.shutdown()

def test_ray_init(ray_start_4_cpus):
    original_resources = ray.available_resources()

    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    # Every worker should report a distinct global rank.
    assert len(set(result)) == 4
    hjob.shutdown()
    assert check_resources(original_resources)

# Parametrized variant of test_ray_init; the extra arguments are presumably
# supplied via @pytest.mark.parametrize with a total of 4 workers.
def test_ray_init_parametrized(ray_start_4_cpus, num_workers, num_hosts,
                               num_workers_per_host):
    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting,
        num_workers=num_workers,
        num_hosts=num_hosts,
        num_workers_per_host=num_workers_per_host,
        use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert len(set(result)) == 4
    hjob.shutdown()

def test_ray_executable(ray_start_4_cpus):
    class Executable:
        def __init__(self, epochs):
            import horovod.torch as hvd
            self.hvd = hvd
            self.epochs = epochs
            self.hvd.init()

        def rank_epoch(self):
            return self.hvd.rank() * self.epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start(executable_cls=Executable, executable_args=[2])
    result = hjob.execute(lambda w: w.rank_epoch())
    # Ranks 0..3 multiplied by epochs=2 give {0, 2, 4, 6}.
    assert set(result) == {0, 2, 4, 6}
    hjob.shutdown()

def run(self):
    def simple_fn(worker):
        local_rank = _train()
        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting,
        num_workers=4,
        num_hosts=None,
        num_workers_per_host=None,
        cpus_per_worker=1,
        gpus_per_worker=int(torch.cuda.is_available()) or None,
        use_gpu=torch.cuda.is_available())
    hjob.start()
    # The strategy is expected to reuse an existing placement group here
    # rather than create its own.
    assert not hjob.adapter.strategy._created_placement_group
    result = hjob.execute(simple_fn)
    assert set(result) == {0, 1, 2, 3}
    hjob.shutdown()