def _ray_start_cluster(**kwargs):
    """Start a local multi-node test cluster, yield it, then tear it down.

    Keyword Args:
        num_nodes: Number of nodes to add to the cluster (default 0).
        do_init: Whether to connect the driver with ``ray.init`` once the
            first node is up. When omitted it is ``True`` if
            ``num_nodes > 0`` and ``False`` otherwise.
        **kwargs: Every other keyword overrides the defaults returned by
            ``get_default_fixture_ray_kwargs()`` and is forwarded to
            ``cluster.add_node``.

    Yields:
        The ``Cluster`` instance.
    """
    # num_nodes & do_init are not arguments for ray.init, so strip them out
    # before the remaining kwargs are merged into the node arguments.
    num_nodes = kwargs.pop("num_nodes", 0)
    do_init = kwargs.pop("do_init", num_nodes > 0)

    init_kwargs = get_default_fixture_ray_kwargs()
    init_kwargs.update(kwargs)

    cluster = Cluster()
    # Everything after the cluster exists is guarded so that nodes that were
    # already started are shut down even if a later add_node / ray.init call
    # fails or an exception is thrown into the generator at the yield.
    try:
        for i in range(num_nodes):
            if i > 0:
                # _system_config is only passed to the first node.
                init_kwargs.pop("_system_config", None)
            cluster.add_node(**init_kwargs)
            # We assume driver will connect to the head (first node),
            # so ray init will be invoked if do_init is true
            if i == 0 and do_init:
                ray.init(address=cluster.address)
        yield cluster
    finally:
        # Teardown: disconnect the driver first, then stop the nodes.
        ray.shutdown()
        cluster.shutdown()
def test_http_head_only():
    """Start Serve with ``location="HeadOnly"`` on a two-node cluster.

    Checks that exactly two actors exist after ``serve.start`` and that the
    per-node available CPU counts are ``{2, 4}``, i.e. both actors consumed
    CPUs on a single node.
    """
    cluster = Cluster()
    client = None
    try:
        head_node = cluster.add_node(num_cpus=4)
        cluster.add_node(num_cpus=4)

        ray.init(head_node.address)
        node_ids = ray.state.node_ids()
        assert len(node_ids) == 2

        client = serve.start(http_options={
            "port": new_port(),
            "location": "HeadOnly"
        })

        # Only the controller and head node actor should be started
        assert len(ray.actors()) == 2

        # They should all be placed on the head node
        cpu_per_nodes = {
            r["CPU"]
            for r in ray.state.state._available_resources_per_node().values()
        }
        assert cpu_per_nodes == {2, 4}
    finally:
        # Always tear down, even when an assertion above fails, so a failing
        # run does not leave a live cluster behind for the next test.
        try:
            if client is not None:
                client.shutdown()
        finally:
            ray.shutdown()
            cluster.shutdown()
def test_detached_deployment():
    """Check that a detached Serve instance is usable from a second job.

    The first driver connection starts Serve with ``detached=True`` and
    serves endpoint ``"f"``; after disconnecting, a second connection (with
    a different job id) attaches via ``serve.connect()`` and must be able to
    create and call endpoint ``"g"``.
    """
    # https://github.com/ray-project/ray/issues/11437

    cluster = Cluster()
    try:
        head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=6)

        # Create first job, check we can run a simple serve endpoint
        ray.init(head_node.address)
        first_job_id = ray.get_runtime_context().job_id
        client = serve.start(detached=True)
        client.create_backend("f", lambda _: "hello")
        client.create_endpoint("f", backend="f")
        assert ray.get(client.get_handle("f").remote()) == "hello"

        ray.shutdown()

        # Create the second job, make sure we can still create new backends.
        ray.init(head_node.address)
        assert ray.get_runtime_context().job_id != first_job_id

        client = serve.connect()
        client.create_backend("g", lambda _: "world")
        client.create_endpoint("g", backend="g")
        assert ray.get(client.get_handle("g").remote()) == "world"

        # Test passed, clean up.
        client.shutdown()
    finally:
        # Runs on failure as well, so the cluster never outlives the test.
        ray.shutdown()
        cluster.shutdown()
def main():
    """Time ``ray.get`` on ten large objects from inside a GPU task.

    A head node (16 CPUs) and one extra node (16 CPUs, 1 GPU) are started,
    each with a 20 GiB object store. The driver puts ten
    ``(1024 * 128, 1024)`` random arrays, a remote task requesting one GPU
    fetches each of them, and the mean / standard deviation of the fetch
    times is printed.
    """
    store_bytes = 20 * 1024 * 1024 * 1024
    head_args = {"object_store_memory": store_bytes, "num_cpus": 16}
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    cluster.add_node(
        object_store_memory=store_bytes, num_gpus=1, num_cpus=16)

    object_ref_list = [
        ray.put(np.random.rand(1024 * 128, 1024)) for _ in range(10)
    ]

    @ray.remote(num_gpus=1)
    def f(refs):
        # Time each fetch individually; the sleep separates the transfers.
        elapsed = []
        for ref in refs:
            started = time.time()
            ray.get(ref)
            elapsed.append(time.time() - started)
            time.sleep(1)
        return np.mean(elapsed), np.std(elapsed)

    time_diff, time_diff_std = ray.get(f.remote(object_ref_list))

    print("latency to get an 1G object over network", round(time_diff, 2),
          "+-", round(time_diff_std, 2))

    ray.shutdown()
    cluster.shutdown()
def ray_4_node_gpu():
    """Yield while a four-node cluster (2 CPUs and 2 GPUs each) is running.

    The driver is connected to the cluster before the yield; afterwards the
    driver is disconnected and the cluster is shut down.
    """
    node_count = 4
    node_resources = {"num_cpus": 2, "num_gpus": 2}

    cluster = Cluster()
    for _node in range(node_count):
        cluster.add_node(**node_resources)

    ray.init(address=cluster.address)

    yield

    # Teardown: driver first, then the nodes.
    ray.shutdown()
    cluster.shutdown()
def ray_start_workers_separate_multinode(request):
    """Start ``request.param[0]`` nodes with ``request.param[1]`` CPUs each.

    Yields:
        The ``(num_nodes, num_initial_workers)`` pair read from
        ``request.param``.
    """
    num_nodes, num_initial_workers = request.param[0], request.param[1]

    # Bring up the Ray processes: one node per requested node count.
    cluster = Cluster()
    for _node in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(address=cluster.address)

    yield num_nodes, num_initial_workers

    # Everything below the yield is teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_4_node_gpu():
    """Yield while a four-node cluster (2 CPUs and 2 GPUs each) is running.

    After the yield the driver is disconnected, the cluster is shut down
    and, if ``dist`` reports an initialized process group, that group is
    destroyed.
    """
    node_count = 4
    node_resources = {"num_cpus": 2, "num_gpus": 2}

    cluster = Cluster()
    for _node in range(node_count):
        cluster.add_node(**node_resources)

    ray.init(address=cluster.address)

    yield

    ray.shutdown()
    cluster.shutdown()

    # Ensure that tests don't ALL fail
    process_group_alive = dist.is_initialized()
    if process_group_alive:
        dist.destroy_process_group()
def ray_start_combination(request):
    """Start a cluster of ``request.param[0]`` nodes with 10 CPUs each.

    The head node is created together with the cluster; the remaining
    ``num_nodes - 1`` nodes are added afterwards.

    Yields:
        ``(num_nodes, num_workers_per_scheduler, cluster)`` where the first
        two values come from ``request.param``.
    """
    num_nodes, num_workers_per_scheduler = (request.param[0],
                                            request.param[1])

    # Bring up the Ray processes, head node first.
    head_args = {"num_cpus": 10, "redis_max_memory": 10**8}
    cluster = Cluster(initialize_head=True, head_node_args=head_args)
    extra_nodes = num_nodes - 1
    for _node in range(extra_nodes):
        cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    yield num_nodes, num_workers_per_scheduler, cluster

    # Everything below the yield is teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_cluster():
    """Yield an empty ``Cluster`` and shut everything down afterwards.

    Yields:
        The ``Cluster`` instance that is shut down during teardown.
    """
    cluster = Cluster()
    # Hand out the instance that teardown shuts down. Yielding a second,
    # fresh Cluster() would leave the nodes a test adds to it running.
    yield cluster
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()
class TrialRunnerPlacementGroupTest(unittest.TestCase):
    """Tune tests that schedule trials through placement groups.

    Every test runs against a fresh single-node cluster (8 CPUs, 4 GPUs,
    16 units of the ``custom`` resource) created in ``setUp``.
    """

    def setUp(self):
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "10000"
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        head_node_args = {
            "include_dashboard": False,
            "num_cpus": self.head_cpus,
            "num_gpus": self.head_gpus,
            "resources": {
                "custom": self.head_custom
            },
            "_system_config": {
                "num_heartbeats_timeout": 10
            }
        }
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args=head_node_args)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def _assertCleanup(self, trial_executor):
        """Assert that the executor's placement group manager is drained."""
        manager = trial_executor._pg_manager

        # Nothing may still be in use or waiting on a staging future.
        for leftover in (manager._in_use_trials, manager._in_use_pgs,
                         manager._staging_futures):
            self.assertFalse(leftover)

        # Every per-factory bucket has to be empty.
        for factory in manager._staging:
            self.assertFalse(manager._staging[factory])
        for factory in manager._ready:
            self.assertFalse(manager._ready[factory])

        self.assertTrue(manager._latest_staging_start_time)

        # No placement group may be left in a non-REMOVED state.
        remaining = sum(1 for info in placement_group_table().values()
                        if info["state"] != "REMOVED")
        self.assertEqual(remaining, 0)

    def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
        """Start 10 trials while resources only allow 2 at a time.

        Placement groups should still be created and PENDING, and the
        trials should eventually be scheduled sequentially (i.e. in pairs
        of two).
        """

        def train(config):
            time.sleep(1)
            finished_at = time.time()
            tune.report(end=finished_at - config["start_time"])

        head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
        child_bundle = {"custom": 1}
        bundles = [head_bundle, child_bundle, child_bundle]
        placement_group_factory = PlacementGroupFactory(bundles)

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        test_case = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                done_states = (Trial.TERMINATED, Trial.ERROR)
                num_finished = sum(
                    1 for trial in trials if trial.status in done_states)

                # Placement groups the manager currently keeps track of.
                num_staging = sum(
                    len(group)
                    for group in trial_executor._pg_manager._staging.values())
                num_ready = sum(
                    len(group)
                    for group in trial_executor._pg_manager._ready.values())
                num_in_use = len(trial_executor._pg_manager._in_use_pgs)
                num_cached = len(trial_executor._pg_manager._cached_pgs)
                total_num_tracked = (
                    num_staging + num_ready + num_in_use + num_cached)

                # Placement groups that actually exist in the cluster.
                num_non_removed_pgs = sum(
                    1 for entry in placement_group_table().values()
                    if entry["state"] != "REMOVED")
                num_removal_scheduled_pgs = len(
                    trial_executor._pg_manager._pgs_for_removal)

                expected_pgs = max(scheduled, len(trials)) - num_finished

                # All trials should be scheduled
                test_case.assertEqual(
                    scheduled,
                    min(scheduled, len(trials)),
                    msg=f"Num trials iter {iteration}")
                # The number of PGs should decrease when trials finish
                test_case.assertEqual(
                    expected_pgs,
                    total_num_tracked,
                    msg=f"Num tracked iter {iteration}")
                # The number of actual placement groups should match this
                test_case.assertEqual(
                    expected_pgs,
                    num_non_removed_pgs - num_removal_scheduled_pgs,
                    msg=f"Num actual iter {iteration}")

        start = time.time()
        out = tune.run(
            train,
            config={"start_time": start},
            resources_per_trial=placement_group_factory,
            num_samples=10,
            trial_executor=trial_executor,
            callbacks=[_TestCallback()],
            reuse_actors=reuse_actors,
            verbose=2)

        trial_end_times = sorted(
            trial.last_result["end"] for trial in out.trials)
        print("Trial end times:", trial_end_times)
        earliest, latest = trial_end_times[0], trial_end_times[-1]
        max_diff = latest - earliest

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

    def testPlacementGroupRequestsWithActorReuse(self):
        """Assert that reuse actors doesn't leak placement groups"""
        self.testPlacementGroupRequests(reuse_actors=True)

    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
    def testPlacementGroupLimitedRequests(self):
        """Assert that maximum number of placement groups is enforced."""
        self.testPlacementGroupRequests(scheduled=6)

    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
    def testPlacementGroupLimitedRequestsWithActorReuse(self):
        """Same as the limited-requests test, with actor reuse enabled."""
        self.testPlacementGroupRequests(reuse_actors=True, scheduled=6)

    def testPlacementGroupDistributedTraining(self, reuse_actors=False):
        """Run distributed training using placement groups.

        Each trial requests 4 CPUs and starts 4 remote training workers.
        """
        head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
        child_bundle = {"CPU": 1}
        bundles = [head_bundle] + [child_bundle] * 3
        placement_group_factory = PlacementGroupFactory(bundles)

        @ray.remote
        class TrainingActor:
            def train(self, val):
                time.sleep(1)
                return val

        def train(config):
            base = config["base"]
            actors = [TrainingActor.remote() for _ in range(4)]
            # Worker number i returns base + 2 * i.
            futures = []
            for index, actor in enumerate(actors):
                futures.append(actor.train.remote(base + 2 * index))
            results = ray.get(futures)

            end = time.time() - config["start_time"]
            tune.report(avg=np.mean(results), end=end)

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        start = time.time()
        search_space = {
            "start_time": start,
            "base": tune.grid_search(list(range(0, 100, 10)))
        }
        out = tune.run(
            train,
            config=search_space,
            resources_per_trial=placement_group_factory,
            num_samples=1,
            trial_executor=trial_executor,
            reuse_actors=reuse_actors,
            verbose=2)

        avgs = sorted(trial.last_result["avg"] for trial in out.trials)
        self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

        trial_end_times = sorted(
            trial.last_result["end"] for trial in out.trials)
        print("Trial end times:", trial_end_times)
        earliest, latest = trial_end_times[0], trial_end_times[-1]
        max_diff = latest - earliest

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

    def testPlacementGroupDistributedTrainingWithActorReuse(self):
        """Distributed-training test again, with actor reuse enabled."""
        self.testPlacementGroupDistributedTraining(reuse_actors=True)
def test_shutdown():
    """After ``Cluster.shutdown`` no added node may report live processes."""
    cluster = Cluster(initialize_head=False)
    # Two nodes are added one after the other, as separate calls.
    first = cluster.add_node()
    second = cluster.add_node()
    cluster.shutdown()
    for node in (first, second):
        # Stop at the first node that still has a process, like any() would.
        if node.any_processes_alive():
            assert False
class RayExecutorPlacementGroupTest(unittest.TestCase):
    """Resource accounting checks for trials with and without placement
    groups, plus equality/hash checks for ``PlacementGroupFactory``.

    Every test runs against a fresh single-node cluster (8 CPUs, 4 GPUs,
    16 units of the ``custom`` resource) created in ``setUp``.
    """

    def setUp(self):
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        head_node_args = {
            "num_cpus": self.head_cpus,
            "num_gpus": self.head_gpus,
            "resources": {
                "custom": self.head_custom
            },
            "_system_config": {
                "num_heartbeats_timeout": 10
            }
        }
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args=head_node_args)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testResourcesAvailableNoPlacementGroup(self):
        """Resources seen inside a trial started from a plain resource dict."""

        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        requested = {
            "cpu": 1,
            "gpu": 1,
            "custom_resources": {
                "custom": 3
            },
            "extra_cpu": 3,
            "extra_gpu": 1,
            "extra_custom_resources": {
                "custom": 4
            },
        }
        out = tune.run(train, resources_per_trial=requested)

        # Only `cpu`, `gpu`, and `custom_resources` will be "really" reserved,
        # the extra_* will just be internally reserved by Tune.
        reported = out.trials[0].last_result["resources"]
        available = {
            name: amount
            for name, amount in reported.items()
            if name in ["CPU", "GPU", "custom"]
        }
        expected = {
            "CPU": self.head_cpus - 1.0,
            "GPU": self.head_gpus - 1.0,
            "custom": self.head_custom - 3.0
        }
        self.assertDictEqual(available, expected)

    def testResourcesAvailableWithPlacementGroup(self):
        """Resources seen inside a trial started from a placement group."""

        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
        child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}
        bundles = [head_bundle, child_bundle, child_bundle]
        placement_group_factory = PlacementGroupFactory(bundles)

        out = tune.run(train, resources_per_trial=placement_group_factory)

        reported = out.trials[0].last_result["resources"]
        available = {
            name: amount
            for name, amount in reported.items()
            if name in ["CPU", "GPU", "custom"]
        }

        if not available:
            self.skipTest(f"Warning: Ray reported no available resources, "
                          f"but this is an error on the Ray core side. "
                          f"Skipping this test for now.")

        # Head bundle plus two child bundles: 5 CPUs, 2 GPUs, 10 custom.
        expected = {
            "CPU": self.head_cpus - 5.0,
            "GPU": self.head_gpus - 2.0,
            "custom": self.head_custom - 10.0
        }
        self.assertDictEqual(available, expected)

    def testPlacementGroupFactoryEquality(self):
        """
        Test that two different placement group factory objects are
        considered equal and evaluate to the same hash.
        """
        from collections import Counter

        # Same bundles, different key order; positional vs. keyword args.
        positional_bundles = [{
            "CPU": 2,
            "GPU": 4,
            "custom": 7
        }, {
            "GPU": 2,
            "custom": 1,
            "CPU": 3
        }]
        pgf_1 = PlacementGroupFactory(positional_bundles, "PACK", "no_name",
                                      None)

        keyword_bundles = [{
            "custom": 7,
            "GPU": 4,
            "CPU": 2,
        }, {
            "custom": 1,
            "GPU": 2,
            "CPU": 3
        }]
        pgf_2 = PlacementGroupFactory(
            keyword_bundles, strategy="PACK", name="no_name", lifetime=None)

        self.assertEqual(pgf_1, pgf_2)

        # Hash testing
        counter = Counter()
        for factory in (pgf_1, pgf_2):
            counter[factory] += 1

        self.assertEqual(counter[pgf_1], 2)
        self.assertEqual(counter[pgf_2], 2)
class RayExecutorQueueTest(unittest.TestCase):
    """Tests for ``RayTrialExecutor`` built with ``queue_trials=True``.

    Every test runs against a fresh cluster whose head node is started
    with ``num_cpus=1``.
    """

    def setUp(self):
        head_node_args = {
            "num_cpus": 1,
            "_system_config": {
                "num_heartbeats_timeout": 10
            }
        }
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args=head_node_args)
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """A GPU trial is still reported as having resources once the
        single-CPU trial has been started."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        executor = self.trial_executor

        cpu_only = create_trial(1, 0)
        self.assertTrue(executor.has_resources_for_trial(cpu_only))
        executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        """Resource checks before and after a 1-CPU / 1-GPU node is added."""
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        executor = self.trial_executor

        gpu_trial = create_trial(1, 1)
        self.assertTrue(executor.has_resources_for_trial(gpu_trial))
        executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements is not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(executor.has_resources_for_trial(cpu_only_trial))

        # Add a second node and wait until the cluster has registered it.
        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(executor.has_resources_for_trial(cpu_only_trial))
        executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(executor.has_resources_for_trial(cpu_only_trial2))
        executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(executor.has_resources_for_trial(cpu_only_trial3))
def test_multiple_routers():
    """Check that Serve runs one HTTP proxy actor per node.

    With ``location="EveryNode"`` on a two-node cluster the test expects
    two proxy actors, expects the routes endpoint to keep answering after
    one proxy is killed, and expects a third proxy to appear when a node
    is added and to disappear when that node is removed again.
    """
    routes_url = "http://127.0.0.1:8005/-/routes"

    cluster = Cluster()
    try:
        head_node = cluster.add_node(num_cpus=4)
        cluster.add_node(num_cpus=4)

        ray.init(head_node.address)
        node_ids = ray.state.node_ids()
        assert len(node_ids) == 2
        client = serve.start(
            http_options=dict(port=8005, location="EveryNode"))

        def get_proxy_names():
            # One proxy actor name per node currently known to the cluster.
            return [
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id)
                for node_id, _ in get_all_node_ids()
            ]

        wait_for_condition(lambda: len(get_proxy_names()) == 2)
        proxy_names = get_proxy_names()

        # Two actors should be started.
        def get_first_two_actors():
            try:
                ray.get_actor(proxy_names[0])
                ray.get_actor(proxy_names[1])
                return True
            except ValueError:
                return False

        wait_for_condition(get_first_two_actors)

        # Wait for the actors to come up.
        ray.get(block_until_http_ready.remote(routes_url))

        # Kill one of the servers, the HTTP server should still function.
        ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
        ray.get(block_until_http_ready.remote(routes_url))

        # Add a new node to the cluster. This should trigger a new router to
        # get started.
        new_node = cluster.add_node()

        wait_for_condition(lambda: len(get_proxy_names()) == 3)
        third_proxy = get_proxy_names()[2]

        def get_third_actor():
            try:
                ray.get_actor(third_proxy)
                return True
            # IndexErrors covers when cluster resources aren't updated yet.
            except (IndexError, ValueError):
                return False

        wait_for_condition(get_third_actor)

        # Remove the newly-added node from the cluster. The corresponding
        # actor should be removed as well.
        cluster.remove_node(new_node)

        def third_actor_removed():
            try:
                ray.get_actor(third_proxy)
                return False
            except ValueError:
                return True

        # Check that the actor is gone and the HTTP server still functions.
        wait_for_condition(third_actor_removed)
        ray.get(block_until_http_ready.remote(routes_url))
    finally:
        # Clean up the nodes (otherwise Ray will segfault). Done in a
        # finally block so it also happens when a check above fails.
        ray.shutdown()
        cluster.shutdown()