def test_both_num_workers_min_workers(ray_8_cpus):
    """RayExecutor must reject specifying both `min_workers` and `num_workers`."""
    settings = RayExecutor.create_settings()
    with pytest.raises(
            ValueError,
            match=r"Both `min_workers` and `num_workers` provided."):
        # The constructor itself raises, so the instance never exists;
        # the original bound it to an unused local `executor` — dropped.
        RayExecutor(settings, min_workers=1, num_workers=1, cpus_per_worker=1)
def test_local(ray_start_4_cpus):
    """Single-host job: all four workers report the same hostname, and the
    Ray resources released on shutdown match what was available before.

    NOTE(review): a later `test_local` definition in this file shadows this
    one, so pytest only collects the last definition — confirm intent.
    """
    resources_before = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(setting, num_hosts=1, num_slots=4)
    executor.start()
    names = executor.execute(lambda _: socket.gethostname())
    assert len(set(names)) == 1, names
    executor.shutdown()
    assert check_resources(resources_before)
def test_local(ray_start_4_cpus, num_workers, num_hosts, num_workers_per_host):
    """Parametrized single-node run: however the topology is specified, all
    workers land on this machine (one distinct hostname)."""
    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_workers=num_workers,
        num_hosts=num_hosts,
        num_workers_per_host=num_workers_per_host)
    executor.start()
    names = executor.execute(lambda _: socket.gethostname())
    assert len(set(names)) == 1, names
    executor.shutdown()
def test_train(ray_start_4_cpus):
    """Three workers run `_train`; local ranks must cover {0, 1, 2} via both
    the synchronous `execute` path and the async `run_remote` path."""
    def simple_fn(worker):
        return _train()

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting, num_workers=3, use_gpu=torch.cuda.is_available())
    executor.start()
    ranks = executor.execute(simple_fn)
    assert set(ranks) == {0, 1, 2}
    # Same check through the object-ref API.
    ranks = ray.get(executor.run_remote(simple_fn, args=[None]))
    assert set(ranks) == {0, 1, 2}
    executor.shutdown()
def test_infeasible_placement(ray_start_2_cpus):
    """Asking for more slots than the 2-CPU cluster can place must time out
    at start() (placement_group_timeout_s=5 keeps the test fast)."""
    setting = RayExecutor.create_settings(
        timeout_s=30, placement_group_timeout_s=5)
    executor = RayExecutor(setting, num_hosts=1, num_slots=4)
    with pytest.raises(TimeoutError):
        executor.start()
    executor.shutdown()
def test_gpu_ids(ray_start_4_cpus_4_gpus):
    """Every worker on the single host sees the same CUDA_VISIBLE_DEVICES
    value listing four GPU ids, and resources are restored after shutdown.

    NOTE(review): a later `test_gpu_ids` definition in this file shadows this
    one under pytest collection — confirm intent.
    """
    resources_before = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting, num_hosts=1, num_workers_per_host=4, use_gpu=True)
    executor.start()
    envs = executor.execute(lambda _: os.environ.copy())
    visible = {env["CUDA_VISIBLE_DEVICES"] for env in envs}
    assert len(visible) == 1, visible
    assert len(envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4
    executor.shutdown()
    assert check_resources(resources_before)
def test_horovod_train(ray_start_4_cpus):
    """Four workers on one host run `_train`; local ranks cover {0..3}."""
    def simple_fn(worker):
        return _train()

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_hosts=1,
        num_slots=4,
        use_gpu=torch.cuda.is_available())
    executor.start()
    ranks = executor.execute(simple_fn)
    assert set(ranks) == {0, 1, 2, 3}
    executor.shutdown()
def test_ray_exec_func(ray_start_4_cpus):
    """run() forwards args to every worker; with num_epochs=0, rank * 0
    collapses all results to a single value."""
    def simple_fn(num_epochs):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_hosts=1,
        num_slots=4,
        use_gpu=torch.cuda.is_available())
    executor.start()
    results = executor.run(simple_fn, args=[0])
    assert len(set(results)) == 1
    executor.shutdown()
def test_min_num_proc(ray_8_cpus):
    """Elastic run pinned to min_workers == max_workers == 4: as the
    discovery schedule grows the host set, exactly four workers are started
    and four finish."""
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:1']),
            (10, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]
        # Pin communication to the first local network interface.
        nic = list(psutil.net_if_addrs().keys())[0]
        settings = RayExecutor.create_settings(nics={nic})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(
            settings,
            min_workers=4,
            max_workers=4,
            override_discovery=False)
        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == 4
        events = trace.fetch()
        assert sum(int("started" in e) for e in events) == 4, events
        assert sum(int("finished" in e) for e in events) == 4, events
def test_fault_tolerance_hosts_remove_and_add_cooldown(ray_8_cpus):
    """Hosts are removed and then re-added under a [1, 1] cooldown window;
    across the run, five workers are started and five finish."""
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:2', 'host-2:1', 'host-3:2']),
            (10, ['host-1:2']),
            (None, ['host-1:2', 'host-2:1', 'host-3:2']),
        ]
        # Pin communication to the first local network interface.
        nic = list(psutil.net_if_addrs().keys())[0]
        settings = RayExecutor.create_settings(nics={nic})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(
            settings,
            min_workers=1,
            cpus_per_worker=1,
            override_discovery=False,
            cooldown_range=[1, 1])
        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == 5
        events = trace.fetch()
        assert sum(int("started" in e) for e in events) == 5, events
        assert sum(int("finished" in e) for e in events) == 5, events
def test_gpu_ids(ray_start_4_cpus_4_gpus):
    """Read worker env vars through the worker actor handles directly: one
    shared CUDA_VISIBLE_DEVICES value naming all four GPUs, and resources
    restored after shutdown."""
    resources_before = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(setting, num_hosts=1, num_slots=4, use_gpu=True)
    executor.start()
    handles = executor.workers
    envs = ray.get([h.env_vars.remote() for h in handles])
    visible = {env["CUDA_VISIBLE_DEVICES"] for env in envs}
    assert len(visible) == 1, visible
    assert len(envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4
    executor.shutdown()
    assert check_resources(resources_before)
def test_ray_init(ray_start_4_cpus):
    """hvd.init() across four workers yields four distinct ranks, and Ray
    resources are restored after shutdown.

    NOTE(review): a later `test_ray_init` definition in this file shadows
    this one under pytest collection — confirm intent.
    """
    resources_before = ray.available_resources()

    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_hosts=1,
        num_slots=4,
        use_gpu=torch.cuda.is_available())
    executor.start()
    ranks = executor.execute(simple_fn)
    assert len(set(ranks)) == 4
    executor.shutdown()
    assert check_resources(resources_before)
def test_gpu_ids_num_workers(ray_start_4_cpus_4_gpus):
    """num_workers=4 with GPUs: workers share one CUDA_VISIBLE_DEVICES value
    covering four devices, and each worker's horovod local rank appears in
    that list."""
    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(setting, num_workers=4, use_gpu=True)
    executor.start()
    envs = executor.execute(lambda _: os.environ.copy())
    visible = {env["CUDA_VISIBLE_DEVICES"] for env in envs}
    assert len(visible) == 1, visible
    assert len(envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4, \
        envs[0]["CUDA_VISIBLE_DEVICES"]

    def _check(worker):
        import horovod.torch as hvd
        hvd.init()
        # Substring check mirrors the original: rank digit must occur in
        # the device list string.
        return str(hvd.local_rank()) in os.environ["CUDA_VISIBLE_DEVICES"]

    assert all(executor.execute(_check))
    executor.shutdown()
def test_ray_init(ray_start_4_cpus, num_workers, num_hosts,
                  num_workers_per_host):
    """Parametrized hvd.init(): four distinct ranks regardless of whether
    topology is given as num_workers or num_hosts x num_workers_per_host."""
    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_workers=num_workers,
        num_hosts=num_hosts,
        num_workers_per_host=num_workers_per_host,
        use_gpu=torch.cuda.is_available())
    executor.start()
    ranks = executor.execute(simple_fn)
    assert len(set(ranks)) == 4
    executor.shutdown()
def test_ray_executable(ray_start_4_cpus):
    """Stateful executable class per worker: each computes rank * epochs,
    so four workers with epochs=2 produce {0, 2, 4, 6}."""
    class Executable:
        def __init__(self, epochs):
            import horovod.torch as hvd
            self.hvd = hvd
            self.epochs = epochs
            self.hvd.init()

        def rank_epoch(self):
            return self.hvd.rank() * self.epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_hosts=1,
        num_slots=4,
        use_gpu=torch.cuda.is_available())
    executor.start(executable_cls=Executable, executable_args=[2])
    results = executor.execute(lambda w: w.rank_epoch())
    assert set(results) == {0, 2, 4, 6}
    executor.shutdown()
def test_ray_exec_remote_func(ray_start_4_cpus, num_workers, num_hosts,
                              num_workers_per_host):
    """run_remote() returns object refs; with num_epochs=0 every worker
    yields the same value (rank * 0)."""
    def simple_fn(num_epochs):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_workers=num_workers,
        num_hosts=num_hosts,
        num_workers_per_host=num_workers_per_host,
        use_gpu=torch.cuda.is_available())
    executor.start()
    refs = executor.run_remote(simple_fn, args=[0])
    results = ray.get(refs)
    assert len(set(results)) == 1
    executor.shutdown()
def run(self):
    """Run four workers without an explicit host topology: no placement
    group is created, and local ranks cover {0..3}."""
    def simple_fn(worker):
        return _train()

    setting = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        setting,
        num_workers=4,
        num_hosts=None,
        num_workers_per_host=None,
        cpus_per_worker=1,
        gpus_per_worker=int(torch.cuda.is_available()) or None,
        use_gpu=torch.cuda.is_available())
    executor.start()
    # With no host topology given, the strategy must not have built a
    # placement group.
    assert not executor.adapter.strategy._created_placement_group
    ranks = executor.execute(simple_fn)
    assert set(ranks) == {0, 1, 2, 3}
    executor.shutdown()