def test_pull_request_retry(shutdown_only):
    cluster = Cluster()
    cluster.add_node(num_cpus=0, num_gpus=1, object_store_memory=100 * 2**20)
    cluster.add_node(num_cpus=1, num_gpus=0, object_store_memory=100 * 2**20)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def put():
        return np.zeros(64 * 2**20, dtype=np.int8)

    @ray.remote(num_cpus=0, num_gpus=1)
    def driver():
        local_ref = ray.put(np.zeros(64 * 2**20, dtype=np.int8))
        remote_ref = put.remote()

        ready, _ = ray.wait([remote_ref], timeout=1)
        assert len(ready) == 0

        del local_ref

        # This should always complete within 10 seconds.
        ready, _ = ray.wait([remote_ref], timeout=20)
        assert len(ready) > 0

    # Pretend the GPU node is the driver. We do this to force the placement of
    # the driver and `put` task on different nodes.
    ray.get(driver.remote())
def test_http_head_only():
    cluster = Cluster()
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2

    client = serve.start(http_options={
        "port": new_port(),
        "location": "HeadOnly",
    })

    # Only the controller and head node actor should be started.
    assert len(ray.actors()) == 2

    # They should all be placed on the head node.
    cpu_per_nodes = {
        r["CPU"]
        for r in ray.state.state._available_resources_per_node().values()
    }
    assert cpu_per_nodes == {2, 4}

    client.shutdown()
    ray.shutdown()
    cluster.shutdown()
def test_connect_with_disconnected_node(shutdown_only):
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_period_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
    assert len(errors) == 0

    # This node is killed by SIGKILL, ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1

    # This node is killed by SIGKILL, ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1

    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0

    # There is no connection error to a dead node.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0

    p.close()
def main():
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "object_store_memory": 20 * 1024 * 1024 * 1024,
            "num_cpus": 16,
        })
    cluster.add_node(
        object_store_memory=20 * 1024 * 1024 * 1024, num_gpus=1, num_cpus=16)

    object_ref_list = []
    for i in range(0, 10):
        object_ref = ray.put(np.random.rand(1024 * 128, 1024))
        object_ref_list.append(object_ref)

    @ray.remote(num_gpus=1)
    def f(object_ref_list):
        diffs = []
        for object_ref in object_ref_list:
            before = time.time()
            ray.get(object_ref)
            after = time.time()
            diffs.append(after - before)
            time.sleep(1)
        return np.mean(diffs), np.std(diffs)

    time_diff, time_diff_std = ray.get(f.remote(object_ref_list))
    print("latency to get a 1GiB object over the network",
          round(time_diff, 2), "+-", round(time_diff_std, 2))

    ray.shutdown()
    cluster.shutdown()
def test_pull_bundles_admission_control(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(*task_args) for task_args in args]
    ray.get(tasks)
def build_cluster(num_nodes, num_cpus, object_store_memory):
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_cpus, object_store_memory=object_store_memory)
    cluster.wait_for_nodes()
    return cluster
def test_multi_node_stats(shutdown_only):
    cluster = Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.ref = ray.put(np.zeros(100000))

        def ping(self):
            pass

    # Each actor will be on a different node.
    a = Actor.remote()
    b = Actor.remote()
    ray.get(a.ping.remote())
    ray.get(b.ping.remote())

    # Verify we have collected stats across the nodes.
    info = memory_summary(cluster.address)
    print(info)
    assert count(info, PUT_OBJ) == 2, info
def create_cluster(num_nodes):
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(resources={str(i): 100}, object_store_memory=10**9)

    ray.init(address=cluster.address)
    return cluster
def test_cluster():
    """Basic test for adding and removing nodes in cluster."""
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    assert node.remaining_processes_alive()
    assert node2.remaining_processes_alive()
    g.remove_node(node2)
    g.remove_node(node)
    assert not any(n.any_processes_alive() for n in [node, node2])
def ray_4_node_gpu():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2, num_gpus=2)

    ray.init(address=cluster.address)

    yield

    ray.shutdown()
    cluster.shutdown()
def ray_start_workers_separate_multinode(request):
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(address=cluster.address)

    yield num_nodes, num_initial_workers

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
def test_redis_password_cluster(self, password, shutdown_only):
    @ray.remote
    def f():
        return 1

    node_args = {"redis_password": password}
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    cluster.add_node(**node_args)

    object_ref = f.remote()
    ray.get(object_ref)
def ray_4_node_gpu():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2, num_gpus=2)

    ray.init(address=cluster.address)

    yield

    ray.shutdown()
    cluster.shutdown()
    # Ensure that subsequent tests don't all fail because a stale process
    # group was left initialized.
    if dist.is_initialized():
        dist.destroy_process_group()
def test_detached_deployment():
    # https://github.com/ray-project/ray/issues/11437
    cluster = Cluster()
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=6)

    # Create first job, check we can run a simple serve endpoint.
    ray.init(head_node.address)
    first_job_id = ray.get_runtime_context().job_id

    client = serve.start(detached=True)
    client.create_backend("f", lambda _: "hello")
    client.create_endpoint("f", backend="f")
    assert ray.get(client.get_handle("f").remote()) == "hello"

    ray.shutdown()

    # Create the second job, make sure we can still create new backends.
    ray.init(head_node.address)
    assert ray.get_runtime_context().job_id != first_job_id

    client = serve.connect()
    client.create_backend("g", lambda _: "world")
    client.create_endpoint("g", backend="g")
    assert ray.get(client.get_handle("g").remote()) == "world"

    # Test passed, clean up.
    client.shutdown()
    ray.shutdown()
    cluster.shutdown()
def _ray_start_cluster(**kwargs):
    init_kwargs = get_default_fixture_ray_kwargs()
    num_nodes = 0
    do_init = False
    # num_nodes & do_init are not arguments for ray.init, so delete them.
    if "num_nodes" in kwargs:
        num_nodes = kwargs["num_nodes"]
        del kwargs["num_nodes"]
    if "do_init" in kwargs:
        do_init = kwargs["do_init"]
        del kwargs["do_init"]
    elif num_nodes > 0:
        do_init = True
    init_kwargs.update(kwargs)
    cluster = Cluster()
    remote_nodes = []
    for i in range(num_nodes):
        if i > 0 and "_system_config" in init_kwargs:
            del init_kwargs["_system_config"]
        remote_nodes.append(cluster.add_node(**init_kwargs))
        # We assume the driver will connect to the head (first) node, so
        # ray.init is only invoked once, when do_init is true.
        if len(remote_nodes) == 1 and do_init:
            ray.init(address=cluster.address)

    yield cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
            "redis_max_memory": 10**8,
        })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    yield num_nodes, num_workers_per_scheduler, cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except
    # that the object store's capacity starts off higher and is later consumed
    # dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=2.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(tasks)
    del allocated
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated
        # yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
def test_shutdown():
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            exp["config"]["framework"] = "torch"
        elif args.eager:
            exp["config"]["framework"] = "tfe"

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=not args.no_ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode)

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(
            print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        queue_trials=args.queue_trials,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True)

    ray.shutdown()
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is
    # enabled by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)

    ray.state.state._check_connected()
    keys = [
        key for r in ray.state.state.redis_clients
        for key in r.keys("WORKER_FAILURE*")
    ]
    if node_failure:
        assert len(keys) <= 1, len(keys)
    else:
        assert len(keys) <= 2, len(keys)
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_system_config": {
                    "num_heartbeats_timeout": 10
                }
            })
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that trials whose resources aren't available can be queued."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        # trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))