def test_gpu_e2e(ray_8_cpus_gpus):
    """Run an elastic job end to end with one GPU per worker slot.

    The executor is configured with ``min_np=4`` and ``max_np=4`` and is fed a
    scripted discovery schedule that grows from one host to four.  The test
    passes when ``executor.run`` returns four results and the recorded events
    contain exactly four "started" and four "finished" entries.

    Events are collected through a ``StatusCallback`` handed to
    ``executor.run``, the same mechanism ``test_min_np`` uses, so both tests
    consume ``_create_training_function`` identically (it is treated as
    returning a single training callable).

    Args:
        ray_8_cpus_gpus: pytest fixture; not referenced in the body.
            Presumably provides the Ray cluster with CPUs and GPUs --
            confirm against the fixture definition.
    """
    with fault_tolerance_patches():
        # Scripted host availability handed to SimpleTestDiscovery.
        # NOTE(review): entries look like (duration, ["host:slots", ...]) with
        # None marking the final, open-ended phase -- confirm against
        # SimpleTestDiscovery.
        discovery_schedule = [
            (20, ['host-1:1']),
            (60, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]

        # Use the first network interface psutil reports on this machine.
        nic = list(psutil.net_if_addrs().keys())[0]

        settings = ElasticRayExecutor.create_settings(
            min_np=4, max_np=4, nics={nic})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = ElasticRayExecutor(
            settings, gpus_per_slot=1, use_gpu=True, override_discovery=False)

        training_fn = _create_training_function(iterations=100)
        executor.start()

        # The callback records every event reported during the run.
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == 4

        events = trace.fetch()
        assert sum(int("started" in e) for e in events) == 4, events
        assert sum(int("finished" in e) for e in events) == 4, events
def test_min_np(ray_8_cpus):
    """Verify that an elastic job pinned to four workers runs all four.

    With ``min_np=4`` and ``max_np=4`` the executor is driven by a scripted
    discovery schedule that starts with one host and ends with four.  The
    test expects four results from ``executor.run`` and exactly four
    "started" plus four "finished" events in the callback trace.

    Args:
        ray_8_cpus: pytest fixture; not referenced in the body.  Presumably
            provides the Ray cluster -- confirm against the fixture
            definition.
    """
    expected_workers = 4

    with fault_tolerance_patches():
        # NOTE(review): each phase looks like (duration, ["host:slots", ...])
        # with None marking the last, open-ended phase -- confirm against
        # SimpleTestDiscovery.
        host_phases = [
            (10, ["host-1:1"]),
            (10, ["host-1:1", "host-4:1", "host-5:1"]),
            (None, ["host-1:1", "host-4:1", "host-5:1", "host-6:1"]),
        ]

        # First interface name reported by psutil on this machine.
        interface_names = list(psutil.net_if_addrs())
        first_nic = interface_names[0]

        elastic_settings = ElasticRayExecutor.create_settings(
            min_np=expected_workers,
            max_np=expected_workers,
            nics={first_nic},
        )
        elastic_settings.discovery = SimpleTestDiscovery(host_phases)

        elastic_executor = ElasticRayExecutor(
            elastic_settings,
            cpus_per_slot=1,
            override_discovery=False,
        )

        train = _create_training_function(iterations=30)
        elastic_executor.start()

        # Callback object that accumulates the events emitted during the run.
        status_trace = StatusCallback()
        outputs = elastic_executor.run(train, callbacks=[status_trace])
        assert len(outputs) == expected_workers

        events = status_trace.fetch()

        def occurrences(marker):
            # Number of recorded events that contain the given marker.
            return sum(1 for event in events if marker in event)

        assert occurrences("started") == expected_workers, events
        assert occurrences("finished") == expected_workers, events