def test_drivers_release_resources(ray_start_head_with_resources):
    """Check that a driver's tasks and actors free their resources on exit.

    The same resource-hungry driver is run repeatedly, once to completion and
    once killed while it sleeps. Every run begins by executing one plain task
    and one ``num_gpus=1`` task, so a run can only print "success" if the
    resources grabbed by the previous run were released in between.

    Args:
        ray_start_head_with_resources: Fixture value used as the Redis
            address that the spawned drivers connect to.
    """
    redis_address = ray_start_head_with_resources

    # Define a driver that starts many long-running tasks and actors and then
    # exits without waiting for them.
    driver_script1 = """
import time
import ray

ray.init(redis_address="{}")

@ray.remote
def f(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
def g(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
class Foo(object):
    def __init__(self):
        pass

# Make sure some resources are available for us to run tasks.
ray.get(f.remote(0))
ray.get(g.remote(0))

# Start a bunch of actors and tasks that use resources. These should all be
# cleaned up when this driver exits.
foos = [Foo.remote() for _ in range(100)]
[f.remote(10 ** 6) for _ in range(100)]

print("success")
""".format(redis_address)

    # Same driver, but it flushes its output and then stays alive so that it
    # can be killed from the outside.
    driver_script2 = (driver_script1 +
                      "import sys\nsys.stdout.flush()\ntime.sleep(10 ** 6)\n")

    def wait_for_success_output(process_handle, timeout=10):
        """Block until the process prints a "success" line.

        Args:
            process_handle: Handle whose ``stdout`` is read line by line.
            timeout: Seconds to keep reading before giving up. The check runs
                between lines, so a single blocking read can overshoot it.

        Raises:
            Exception: If the output stream ends or the timeout elapses
                before "success" is seen.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            raw_line = process_handle.stdout.readline()
            if not raw_line:
                # An empty read means end-of-stream (a blank line would still
                # carry its newline). Fail now instead of spinning on empty
                # reads until the timeout expires.
                raise Exception(
                    "Process output ended before it printed success.")
            output_line = ray.utils.decode(raw_line).strip()
            print(output_line)
            if output_line == "success":
                return
        raise Exception("Timed out waiting for process to print success.")

    # Make sure we can run this driver repeatedly, which means that resources
    # are getting released in between.
    for _ in range(5):
        out = run_string_as_driver(driver_script1)
        # Make sure the first driver ran to completion.
        assert "success" in out
        # Also make sure that this works when the driver exits ungracefully.
        process_handle = run_string_as_driver_nonblocking(driver_script2)
        try:
            wait_for_success_output(process_handle)
        finally:
            # Kill the process ungracefully. This runs even when the wait
            # above fails, so the sleeping driver is never left behind.
            # NOTE(review): assumes the handle is a subprocess.Popen, so
            # wait() is available to reap the killed child - confirm.
            process_handle.kill()
            process_handle.wait()
def test_infeasible_tasks(ray_start_empty_cluster):
    """Check that infeasible tasks run once a suitable node is added.

    A task is infeasible here when it asks for a custom resource that no
    node in the cluster currently offers. The test covers a task submitted
    by this driver and a task submitted by a driver that has already exited.

    Args:
        ray_start_empty_cluster: Fixture providing a cluster object with
            ``add_node`` and ``redis_address``.
    """
    cluster = ray_start_empty_cluster

    @ray.remote
    def f():
        return

    cluster.add_node(resources={str(0): 100})

    ray.init(redis_address=cluster.redis_address)

    # Submit an infeasible task.
    x_id = f._submit(args=[], kwargs={}, resources={str(1): 1})

    # Add a node that makes the task feasible and make sure we can get the
    # result.
    cluster.add_node(resources={str(1): 100})
    ray.get(x_id)

    # Start a driver that submits an infeasible task and then let it exit.
    # Literal braces in the script are doubled so that str.format leaves them
    # alone; only the Redis address is substituted.
    driver_script = """
import ray

ray.init(redis_address="{}")

@ray.remote(resources={{str(2): 1}})
def f():
    pass

f.remote()
""".format(cluster.redis_address)

    run_string_as_driver(driver_script)

    # Now add a new node that makes the task feasible.
    cluster.add_node(resources={str(2): 100})

    # Make sure we can still run tasks on all nodes.
    ray.get([
        f._submit(args=[], kwargs={}, resources={str(i): 1})
        for i in range(3)
    ])
def test_driver_exiting_quickly(ray_start_head):
    """Run drivers that submit work and exit without waiting for it.

    Two kinds of short-lived driver are used: one creates an actor, the
    other submits a task. After each round the test checks that all Ray
    processes are still alive.

    Args:
        ray_start_head: Fixture value used as the Redis address.
    """
    redis_address = ray_start_head
    ray.init(redis_address=redis_address)

    # Driver that creates an actor and exits immediately.
    actor_driver_script = """
import ray
ray.init(redis_address="{}")
@ray.remote
class Foo(object):
    def __init__(self):
        pass
Foo.remote()
print("success")
""".format(redis_address)

    # Driver that submits a task and exits immediately.
    task_driver_script = """
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    return 1
f.remote()
print("success")
""".format(redis_address)

    # Run both drivers a few times; each must reach its final print, and
    # nothing in the cluster may have died afterwards.
    for _ in range(3):
        for script in (actor_driver_script, task_driver_script):
            driver_output = run_string_as_driver(script)
            assert "success" in driver_output
        assert ray.services.all_processes_alive()
def test_connecting_in_local_case(ray_start_regular):
    """Check that a separate driver process can connect to the cluster.

    Args:
        ray_start_regular: Fixture value; a mapping whose "redis_address"
            entry is the address the driver connects to.
    """
    redis_address = ray_start_regular["redis_address"]

    # The driver does nothing except connect to Redis and report back.
    script_template = """
import ray
ray.init(redis_address="{}")
print("success")
"""
    driver_output = run_string_as_driver(
        script_template.format(redis_address))

    # "success" is only printed after ray.init has returned in the driver.
    assert "success" in driver_output
def test_run_driver_twice(ray_start_regular):
    """Run the same Tune driver twice and require both runs to succeed.

    Regression test for issues 2165 and 2288, where a driver would hang when
    it was run for the second time:
    https://github.com/ray-project/ray/issues/2165
    https://github.com/ray-project/ray/issues/2288

    Args:
        ray_start_regular: Fixture value; a mapping whose "redis_address"
            entry is the address the driver connects to.
    """
    redis_address = ray_start_regular["redis_address"]

    # Braces that belong to the script itself are doubled so that str.format
    # only substitutes the Redis address.
    script_template = """
import ray
import ray.tune as tune
import os
import time

def train_func(config, reporter):  # add a reporter arg
    for i in range(2):
        time.sleep(0.1)
        reporter(timesteps_total=i, mean_accuracy=i+97)  # report metrics

os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init(redis_address="{}")
ray.tune.register_trainable("train_func", train_func)

tune.run_experiments({{
    "my_experiment": {{
        "run": "train_func",
        "stop": {{"mean_accuracy": 99}},
        "config": {{
            "layer1": {{
                "class_name": tune.grid_search(["a"]),
                "config": {{"lr": tune.grid_search([1, 2])}}
            }},
        }},
        "local_dir": os.path.expanduser("~/tmp")
    }}
}})
print("success")
"""
    driver_script = script_template.format(redis_address)

    # First run, then the second run that used to hang.
    attempts_left = 2
    while attempts_left > 0:
        driver_output = run_string_as_driver(driver_script)
        assert "success" in driver_output
        attempts_left -= 1
def test_remote_function_isolation(ray_start_head):
    """Check that same-named remote functions in two drivers do not clash.

    A second driver defines ``f`` and ``g`` returning 3 and 4; this driver
    then defines its own ``f`` and ``g`` returning 1 and 2. Each driver must
    always get its own results back.

    Args:
        ray_start_head: Fixture value used as the Redis address.
    """
    redis_address = ray_start_head
    ray.init(redis_address=redis_address)

    # The other driver defines and repeatedly calls its own f and g.
    script_template = """
import ray
import time
ray.init(redis_address="{}")
@ray.remote
def f():
    return 3
@ray.remote
def g(x, y):
    return 4
for _ in range(10000):
    result = ray.get([f.remote(), g.remote(0, 0)])
    assert result == [3, 4]
print("success")
"""
    driver_output = run_string_as_driver(
        script_template.format(redis_address))

    # Define functions with the same names, but different signatures and
    # return values, in this driver.
    @ray.remote
    def f():
        return 1

    @ray.remote
    def g(x):
        return 2

    expected = [1, 2]
    for _ in range(10000):
        object_ids = [f.remote(), g.remote(0)]
        assert ray.get(object_ids) == expected

    # Make sure the other driver succeeded.
    assert "success" in driver_output
def test_driver_exiting_when_worker_blocked(ray_start_head):
    """Exit drivers while one of their tasks is blocked in ``ray.get``.

    Each spawned driver submits ``g``, which blocks on a call to ``f`` that
    sleeps for 10**6 seconds, and then exits after one second. Afterwards
    this driver must still be able to run a task.

    Args:
        ray_start_head: Fixture value used as the Redis address.
    """
    redis_address = ray_start_head
    ray.init(redis_address=redis_address)

    # Driver that leaves a blocked task behind and exits.
    script_template = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.get(f.remote())
g.remote()
time.sleep(1)
print("success")
"""
    blocked_driver_script = script_template.format(redis_address)

    # Run the driver a few times; every run must reach its final print.
    for _ in range(3):
        driver_output = run_string_as_driver(blocked_driver_script)
        assert "success" in driver_output

    @ray.remote
    def f():
        return 1

    # Make sure we can still talk with the raylet.
    pending = f.remote()
    ray.get(pending)
def test_error_isolation(ray_start_head):
    """Check that each driver only sees the errors it caused itself.

    This driver raises one error in a remote function and waits for it to be
    reported. A second driver then checks that it starts with no errors,
    raises its own error and sees only that one. Finally this driver checks
    that the second driver's error did not leak into its own error list.

    Args:
        ray_start_head: Fixture value used as the Redis address.

    Raises:
        Exception: If this driver's error is not reported within the wait
            deadline.
    """
    redis_address = ray_start_head
    # Connect a driver to the Ray cluster.
    ray.init(redis_address=redis_address)

    # There shouldn't be any errors yet.
    assert len(ray.error_info()) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis. The wait is bounded so that a
    # lost (or duplicated) error fails the test instead of hanging it.
    wait_timeout = 30  # seconds; NOTE(review): chosen generously - tune.
    start_time = time.time()
    while len(ray.error_info()) != 1:
        if time.time() - start_time > wait_timeout:
            raise Exception("Timed out waiting for error to appear.")
        time.sleep(0.1)
        print("Waiting for error to appear.")

    # Make sure we got the error.
    errors = ray.error_info()
    assert len(errors) == 1
    assert error_string1 in errors[0]["message"]

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0]["message"]

print("success")
""".format(redis_address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = ray.error_info()
    assert len(errors) == 1
    assert error_string1 in errors[0]["message"]