def test_ray_options(shutdown_only):
    """Check that ``.options()`` overrides the ``@ray.remote`` resources.

    The same task runs once with its decorator-level resource requests and
    once with per-call overrides. The available resources it reports from
    inside the task must differ for every overridden resource.
    """
    ray.init(num_cpus=10, num_gpus=10, resources={"custom1": 2})

    @ray.remote(
        num_cpus=2, num_gpus=3, memory=150 * 2**20, resources={"custom1": 1})
    def foo():
        import time
        # Sleep for a heartbeat period to ensure resources changing reported.
        time.sleep(0.1)
        return ray.available_resources()

    # View of the cluster while the task holds its default resources.
    default_view = ray.get(foo.remote())

    # View of the cluster while the task holds the overridden resources.
    overridden_foo = foo.options(
        num_cpus=3, num_gpus=4, memory=50 * 2**20, resources={"custom1": 0.5})
    overridden_view = ray.get(overridden_foo.remote())

    for name in ("CPU", "GPU", "memory", "custom1"):
        assert default_view[name] != overridden_view[name], name
    assert default_view != overridden_view
def test_internal_free(shutdown_only):
    """Check that ``ray.internal.free`` makes an object unretrievable.

    Covers both a small actor return value and a large numpy array.
    """
    ray.init(num_cpus=1)

    @ray.remote
    class Sampler:
        def sample(self):
            return [1, 2, 3, 4, 5]

        def sample_big(self):
            return np.zeros(1024 * 1024)

    def fetch_free_and_expect_failure(ref, settle_seconds):
        # The object must be retrievable before it is freed ...
        ray.get(ref)
        ray.internal.free(ref)
        if settle_seconds:
            time.sleep(settle_seconds)
        # ... and must not be retrievable afterwards.
        with pytest.raises(Exception):
            ray.get(ref)

    sampler = Sampler.remote()

    # Free deletes from in-memory store.
    fetch_free_and_expect_failure(sampler.sample.remote(), 0)

    # Free deletes big objects from plasma store; wait for delete RPC to
    # propagate before checking.
    fetch_free_and_expect_failure(sampler.sample_big.remote(), 1)
def test_wait_makes_object_local(ray_start_cluster):
    """Check that ``ray.get`` and ``ray.wait`` both make an object local.

    An actor produces a large array; the driver's core worker must not report
    the object as existing before the call and must report it afterwards.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=0)
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return np.zeros(1024 * 1024)

    def is_local(ref):
        # Ask the driver's core worker whether it has the object.
        return ray.worker.global_worker.core_worker.object_exists(ref)

    producer = Foo.remote()

    # Test get makes the object local.
    fetched_ref = producer.method.remote()
    assert not is_local(fetched_ref)
    ray.get(fetched_ref)
    assert is_local(fetched_ref)

    # Test wait makes the object local.
    waited_ref = producer.method.remote()
    assert not is_local(waited_ref)
    ready, _ = ray.wait([waited_ref])
    assert len(ready) == 1
    assert is_local(waited_ref)
def test_actor_pass_by_ref_order_optimization(shutdown_only):
    """Check an actor call with a ready argument is not stuck behind a slow one.

    Two runner tasks submit calls to the same actor: the first passes the
    result of a 30 second task, the second the result of an immediate task.
    The second runner must finish in under 10 seconds.
    """
    ray.init(num_cpus=4)

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self, x):
            pass

    actor = Actor.remote()

    @ray.remote
    def fast_value():
        print("fast value")

    @ray.remote
    def slow_value():
        print("start sleep")
        time.sleep(30)

    @ray.remote
    def runner(producer):
        print("runner", actor, producer)
        argument_ref = producer.remote()
        return ray.get(actor.f.remote(argument_ref))

    # Queue the slow call first and give it a head start.
    runner.remote(slow_value)
    time.sleep(1)

    fast_result = runner.remote(fast_value)
    began = time.time()
    ray.get(fast_result)
    elapsed = time.time() - began
    assert elapsed < 10, "did not skip slow value"
def test_multiple_waits_and_gets(shutdown_only):
    """Check that concurrent waits/gets on the same object ref all return."""
    # It is important to use three workers here, so that the three tasks
    # launched in this experiment can run at the same time.
    ray.init(num_cpus=3)

    @ray.remote
    def delayed_one(delay):
        time.sleep(delay)
        return 1

    @ray.remote
    def wait_on(input_list):
        # The argument input_list should be a list containing one object ref.
        ray.wait([input_list[0]])

    @ray.remote
    def get_on(input_list):
        # The argument input_list should be a list containing one object ref.
        ray.get(input_list[0])

    # First make sure that multiple wait requests involving the same object
    # ref all return, then do the same for multiple get requests.
    for consumer in (wait_on, get_on):
        shared_ref = delayed_one.remote(1)
        consumers = [consumer.remote([shared_ref]) for _ in range(2)]
        ray.get(consumers)
def test_background_tasks_with_max_calls(shutdown_only):
    """Check ``max_calls=1`` tasks that return refs to background tasks.

    First the nested refs must still be retrievable. Then each worker process
    must exit once the driver has fetched and released the nested ref.
    ``wait_for_pid_to_exit`` is a helper defined outside this block.
    """
    ray.init(num_cpus=2)

    @ray.remote
    def g():
        time.sleep(.1)
        return 0

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return [g.remote()]

    outstanding = ray.get([f.remote() for _ in range(10)])

    # Should still be able to retrieve these objects, since f's workers will
    # wait for g to finish before exiting.
    inner_refs = [wrapped[0] for wrapped in outstanding]
    ray.get(inner_refs)
    del inner_refs

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return os.getpid(), g.remote()

    outstanding = ray.get([f.remote() for _ in range(10)])
    # Pop each entry so the list stops holding the ref, and drop the local
    # name too, before waiting for the worker process to exit.
    while len(outstanding) > 0:
        worker_pid, inner_ref = outstanding.pop(0)
        ray.get(inner_ref)
        del inner_ref
        wait_for_pid_to_exit(worker_pid)
def test_put_get(shutdown_only):
    """Check ``ray.put``/``ray.get`` round-trip ints, floats, strs and lists."""
    ray.init(num_cpus=0)

    # One value factory per type, applied to i = 0..99 in this order.
    value_factories = (
        lambda i: i * 10**6,  # integers
        lambda i: i * 10**6 * 1.0,  # floats
        lambda i: "h" * i,  # strings
        lambda i: [1] * i,  # lists
    )

    for make_value in value_factories:
        for i in range(100):
            value_before = make_value(i)
            value_after = ray.get(ray.put(value_before))
            assert value_before == value_after
def test_submit_api(shutdown_only):
    """Exercise the low-level ``_remote`` API for tasks, actors and methods.

    Covers ``num_returns`` from 0 to 4, per-call resource requests, an
    infeasible resource request, and actor creation with args and kwargs.
    """
    ray.init(num_cpus=2, num_gpus=1, resources={"Custom": 1})

    @ray.remote
    def f(n):
        return list(range(n))

    @ray.remote
    def g():
        return ray.get_gpu_ids()

    # num_returns=0 yields no object ref at all.
    assert f._remote([0], num_returns=0) is None

    # num_returns=1 yields a single ref holding the whole return value.
    single_ref = f._remote(args=[1], num_returns=1)
    assert ray.get(single_ref) == [0]

    # num_returns=k yields k refs, one per element of the returned list.
    for count in (2, 3):
        refs = f._remote(args=[count], num_returns=count)
        assert len(refs) == count
        assert ray.get(list(refs)) == list(range(count))

    # A task that requests the only GPU sees GPU id 0.
    gpu_task = g._remote(
        args=[], num_cpus=1, num_gpus=1, resources={"Custom": 1})
    assert ray.get(gpu_task) == [0]

    # A task that requests a resource nobody has must never become ready,
    # while a task without GPU request runs and sees no GPUs.
    infeasible_id = g._remote(args=[], resources={"NonexistentCustom": 1})
    assert ray.get(g._remote()) == []
    ready_ids, remaining_ids = ray.wait([infeasible_id], timeout=0.05)
    assert len(ready_ids) == 0
    assert len(remaining_ids) == 1

    @ray.remote
    class Actor:
        def __init__(self, x, y=0):
            self.x = x
            self.y = y

        def method(self, a, b=0):
            return self.x, self.y, a, b

        def gpu_ids(self):
            return ray.get_gpu_ids()

    @ray.remote
    class Actor2:
        def __init__(self):
            pass

        def method(self):
            pass

    a = Actor._remote(
        args=[0], kwargs={"y": 1}, num_gpus=1, resources={"Custom": 1})

    a2 = Actor2._remote()
    ray.get(a2.method._remote())

    method_refs = a.method._remote(
        args=["test"], kwargs={"b": 2}, num_returns=4)
    assert len(method_refs) == 4
    assert ray.get(list(method_refs)) == [0, 1, "test", 2]
def test_ignore_http_proxy(shutdown_only):
    """Check that a task still runs when HTTP(S) proxy variables are set.

    The proxy variables are set after ``ray.init`` and are restored to their
    previous state when the test ends, so the bogus proxy does not leak into
    other tests running in the same process.
    """
    ray.init(num_cpus=1)

    proxy_vars = ("http_proxy", "https_proxy")
    # Remember the current values (None means "was not set").
    previous_values = {name: os.environ.get(name) for name in proxy_vars}
    try:
        for name in proxy_vars:
            os.environ[name] = "http://example.com"

        @ray.remote
        def f():
            return 1

        assert ray.get(f.remote()) == 1
    finally:
        # Put the environment back exactly as it was found.
        for name, previous in previous_values.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
def test_redefining_remote_functions(shutdown_only):
    """Check that redefining a remote function takes effect.

    Covers a changed body, an unchanged body whose dependency appears later,
    and a remote function defined inside another task.
    """
    ray.init(num_cpus=1)

    # Test that we can define a remote function in the shell.
    @ray.remote
    def f(x):
        return x + 1

    assert ray.get(f.remote(0)) == 1

    # Test that we can redefine the remote function.
    @ray.remote
    def f(x):
        return x + 10

    # Keep calling until the new definition is the one that runs; only the
    # old result (1) or the new result (10) is acceptable.
    observed = ray.get(f.remote(0))
    while observed != 10:
        assert observed in [1, 10]
        logger.info("Still using old definition of f, trying again.")
        observed = ray.get(f.remote(0))

    # Check that we can redefine functions even when the remote function source
    # doesn't change (see https://github.com/ray-project/ray/issues/6130).
    # NOTE: `g` must be defined *before* `nonexistent` so that the first call
    # fails.
    @ray.remote
    def g():
        return nonexistent()

    with pytest.raises(RayTaskError, match="nonexistent"):
        ray.get(g.remote())

    def nonexistent():
        return 1

    # Redefine the function and make sure it succeeds.
    @ray.remote
    def g():
        return nonexistent()

    assert ray.get(g.remote()) == 1

    # Check the same thing but when the redefined function is inside of another
    # task.
    @ray.remote
    def h(captured):
        @ray.remote
        def j():
            return captured

        return j.remote()

    for expected in range(20):
        inner_ref = ray.get(h.remote(expected))
        assert ray.get(inner_ref) == expected
def test_invalid_arguments(shutdown_only):
    """Check that ``ray.remote`` rejects invalid integer-valued options.

    Each keyword is tried with a random out-of-range negative integer and a
    random float in [0, 1); both must raise ``ValueError`` with the expected
    message.
    """
    ray.init(num_cpus=2)

    def new_function():
        # A fresh plain function to decorate on every attempt.
        def returns_one():
            return 1

        return returns_one

    def new_class():
        # A fresh plain class to decorate on every attempt.
        class Holder:
            x = 1

        return Holder

    # (keyword, exclusive upper bound of the random negative integer,
    #  expected error message, factory for the object being decorated)
    cases = [
        ("num_returns", -1,
         "The keyword 'num_returns' only accepts 0 or a positive integer",
         new_function),
        ("max_retries", -2,
         "The keyword 'max_retries' only accepts 0, -1 or a positive integer",
         new_function),
        ("max_calls", -1,
         "The keyword 'max_calls' only accepts 0 or a positive integer",
         new_function),
        ("max_restarts", -2,
         "The keyword 'max_restarts' only accepts -1, 0 or a positive integer",
         new_class),
        ("max_task_retries", -2,
         "The keyword 'max_task_retries' only accepts -1, 0 or a"
         " positive integer", new_class),
    ]

    for keyword, upper_bound, message, make_target in cases:
        invalid_values = [
            np.random.randint(-100, upper_bound),
            np.random.uniform(0, 1),
        ]
        for invalid in invalid_values:
            with pytest.raises(ValueError, match=message):
                decorator = ray.remote(**{keyword: invalid})
                decorator(make_target())
def test_grpc_message_size(shutdown_only):
    """Check that a task with 200 inlined 50 KB arguments can be submitted."""
    ray.init(num_cpus=1)

    @ray.remote
    def bar(*a):
        return

    # 50KiB, not enough to spill to plasma, but will be inlined.
    def make_argument():
        return np.zeros(50000, dtype=np.uint8)

    arguments = [make_argument() for _ in range(200)]

    # Executes a 10MiB task spec
    ray.get(bar.remote(*arguments))
def test_wait_timing(shutdown_only):
    """Check that ``ray.wait`` with a 0.2 s timeout returns within 0.2-0.3 s."""
    ray.init(num_cpus=2)

    @ray.remote
    def sleeper():
        time.sleep(1)

    pending = sleeper.remote()

    began = time.time()
    ready, not_ready = ray.wait([pending], timeout=0.2)
    elapsed = time.time() - began

    # The call must honour the timeout without overshooting it much.
    assert 0.2 < elapsed < 0.3
    assert len(ready) == 0
    assert len(not_ready) == 1
def test_defining_remote_functions(shutdown_only):
    """Check that remote functions can close over data, modules and tasks.

    Covers closing over plain data, using imported modules (``np``, ``time``)
    and remote functions that call other remote functions.
    """
    ray.init(num_cpus=3)

    # Test that we can close over plain old data.
    data = [
        np.zeros([3, 5]), (1, 2, "a"), [0.0, 1.0, 1 << 62], 1 << 60, {
            "a": np.zeros(3)
        }
    ]

    @ray.remote
    def g():
        return data

    ray.get(g.remote())

    # Test that we can close over modules.
    @ray.remote
    def h():
        return np.zeros([3, 5])

    # np.array_equal replaces np.alltrue, which was removed in NumPy 2.0; it
    # also checks that the shapes match.
    assert np.array_equal(ray.get(h.remote()), np.zeros([3, 5]))

    @ray.remote
    def j():
        return time.time()

    ray.get(j.remote())

    # Test that we can define remote functions that call other remote
    # functions.
    @ray.remote
    def k(x):
        return x + 1

    @ray.remote
    def k2(x):
        return ray.get(k.remote(x))

    @ray.remote
    def m(x):
        return ray.get(k2.remote(x))

    assert ray.get(k.remote(1)) == 2
    assert ray.get(k2.remote(1)) == 2
    assert ray.get(m.remote(1)) == 2
def test_caching_functions_to_run(shutdown_only):
    """Check functions registered before ``ray.init`` run on workers in order.

    Four functions, each appending a marker to ``sys.path``, are registered
    before the driver connects. A task then reads the last four entries of
    ``sys.path`` on the worker.
    """
    # Test that we export functions to run on all workers before the driver
    # is connected.
    run_on_all_workers = ray.worker.global_worker.run_function_on_all_workers

    def append_one(worker_info):
        sys.path.append(1)

    run_on_all_workers(append_one)

    def append_two(worker_info):
        sys.path.append(2)

    run_on_all_workers(append_two)

    def append_three(worker_info):
        sys.path.append(3)

    run_on_all_workers(append_three)

    def append_four(worker_info):
        sys.path.append(4)

    run_on_all_workers(append_four)

    ray.init(num_cpus=1)

    @ray.remote
    def get_state():
        time.sleep(1)
        return tuple(sys.path[-4:])

    first = get_state.remote()
    second = get_state.remote()
    expected_markers = (1, 2, 3, 4)
    assert ray.get(first) == expected_markers
    assert ray.get(second) == expected_markers

    # Clean up the path on the workers.
    def remove_markers(worker_info):
        for _ in range(4):
            sys.path.pop()

    run_on_all_workers(remove_markers)
def test_variable_number_of_args(shutdown_only):
    """Check remote functions taking ``*args``, required args, or no args.

    The zero-argument case used to sit in a nested ``testNoArgs(self)``
    function that was never called; it now runs as part of this test.
    """
    ray.init(num_cpus=1)

    @ray.remote
    def varargs_fct1(*a):
        return " ".join(map(str, a))

    @ray.remote
    def varargs_fct2(a, *b):
        return " ".join(map(str, b))

    x = varargs_fct1.remote(0, 1, 2)
    assert ray.get(x) == "0 1 2"
    x = varargs_fct2.remote(0, 1, 2)
    assert ray.get(x) == "1 2"

    @ray.remote
    def f1(*args):
        return args

    @ray.remote
    def f2(x, y, *args):
        return x, y, args

    assert ray.get(f1.remote()) == ()
    assert ray.get(f1.remote(1)) == (1, )
    assert ray.get(f1.remote(1, 2, 3)) == (1, 2, 3)

    # Missing required positional arguments must be rejected at submission.
    with pytest.raises(Exception):
        f2.remote()
    with pytest.raises(Exception):
        f2.remote(1)

    assert ray.get(f2.remote(1, 2)) == (1, 2, ())
    assert ray.get(f2.remote(1, 2, 3)) == (1, 2, (3, ))
    assert ray.get(f2.remote(1, 2, 3, 4)) == (1, 2, (3, 4))

    # A remote function with no parameters at all must work too.
    @ray.remote
    def no_op():
        pass

    assert ray.get(no_op.remote()) is None
def test_actor_call_order(shutdown_only):
    """Check that actor calls run in submission order.

    Each call depends on a task that sleeps for a random short time; the
    actor asserts that it sees the calls in the order they were submitted.
    """
    ray.init(num_cpus=4)

    @ray.remote
    def small_value():
        time.sleep(0.01 * np.random.randint(0, 10))
        return 0

    @ray.remote
    class Actor:
        def __init__(self):
            self.count = 0

        def inc(self, count, dependency):
            # Fails if this call is executed out of submission order.
            assert count == self.count
            self.count += 1
            return count

    num_calls = 100
    counter = Actor.remote()
    call_refs = [
        counter.inc.remote(index, small_value.remote())
        for index in range(num_calls)
    ]
    assert ray.get(call_refs) == list(range(num_calls))
def test_caching_actors(shutdown_only):
    """Check an actor class defined before ``ray.init`` works after it."""

    # Test defining actors before ray.init() has been called.
    @ray.remote
    class Foo:
        def __init__(self):
            pass

        def get_val(self):
            return 3

    # Check that we can't actually create actors before ray.init() has been
    # called.
    with pytest.raises(Exception):
        Foo.remote()

    ray.init(num_cpus=1)

    handle = Foo.remote()
    value = ray.get(handle.get_val.remote())
    assert value == 3
def test_fair_queueing(shutdown_only):
    """Check 1000 three-level nested task chains finish on a single CPU."""
    ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1})

    @ray.remote
    def h():
        return 0

    @ray.remote
    def g():
        return ray.get(h.remote())

    @ray.remote
    def f():
        return ray.get(g.remote())

    num_tasks = 1000
    submitted = [f.remote() for _ in range(num_tasks)]

    # This will never finish without fair queueing of {f, g, h}:
    # https://github.com/ray-project/ray/issues/3644
    ready, _ = ray.wait(submitted, timeout=60.0, num_returns=num_tasks)
    assert len(ready) == num_tasks, len(ready)
def test_system_config_when_connecting(ray_start_cluster):
    """Check ``_system_config`` is rejected on connect but used by the cluster.

    The cluster comes from the ``ray_start_cluster`` fixture, as in the other
    cluster tests in this file, rather than from a separately constructed
    ``Cluster`` that nothing in this test shuts down.
    """
    config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
    cluster = ray_start_cluster
    cluster.add_node(
        _system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    # Put several more 40 MiB objects into the 100 MiB store, dropping each
    # reference immediately.
    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
    del put_ref

    # This would not raise an exception if object pinning was enabled.
    with pytest.raises(ray.exceptions.ObjectLostError):
        ray.get(obj_ref)
def test_object_transfer_dump(ray_start_cluster):
    """Check the object transfer timeline after an all-to-all broadcast.

    The dump must be JSON-serializable, contain at least ``num_nodes**2``
    events, and show send and receive events from ``num_nodes`` distinct pids.
    """
    cluster = ray_start_cluster

    num_nodes = 3
    # Each node gets its own custom resource so tasks can be pinned to it.
    node_labels = [str(index) for index in range(num_nodes)]
    for label in node_labels:
        cluster.add_node(resources={label: 1}, object_store_memory=10**9)
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
        return

    # These objects will live on different nodes.
    object_refs = [
        f._remote(args=[1], resources={label: 1}) for label in node_labels
    ]

    # Broadcast each object from each machine to each other machine.
    for object_ref in object_refs:
        consumers = [
            f._remote(args=[object_ref], resources={label: 1})
            for label in node_labels
        ]
        ray.get(consumers)

    # The profiling information only flushes once every second.
    time.sleep(1.1)

    transfer_dump = ray.object_transfer_timeline()
    # Make sure the transfer dump can be serialized with JSON.
    json.loads(json.dumps(transfer_dump))
    assert len(transfer_dump) >= num_nodes**2

    def distinct_pids(event_name):
        # Set of "pid" values among events with the given name.
        return {
            event["pid"]
            for event in transfer_dump if event["name"] == event_name
        }

    assert len(distinct_pids("transfer_receive")) == num_nodes
    assert len(distinct_pids("transfer_send")) == num_nodes
def test_wait_cluster(ray_start_cluster):
    """Check ``ray.wait`` with ``timeout=0`` sees finished remote-node tasks.

    A first batch is timed; a second batch is then given twice that long
    before a non-blocking ``ray.wait`` must report every task as ready.
    """
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=1, resources={"RemoteResource": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"RemoteResource": 1})
    def f():
        return

    batch_size = 10

    # Make sure we have enough workers on the remote nodes to execute some
    # tasks.
    warmup = [f.remote() for _ in range(batch_size)]
    began = time.time()
    ray.get(warmup)
    warmup_duration = time.time() - began

    # Submit some more tasks that can only be executed on the remote nodes.
    batch = [f.remote() for _ in range(batch_size)]

    # Sleep for a bit to let the tasks finish.
    time.sleep(warmup_duration * 2)
    _, unready = ray.wait(batch, num_returns=len(batch), timeout=0)

    # All remote tasks should have finished.
    assert len(unready) == 0
def test_omp_threads_set(shutdown_only):
    """Check that ``OMP_NUM_THREADS`` is "1" in the driver after ``ray.init``."""
    ray.init(num_cpus=1)

    # Should have been auto set by ray init.
    omp_threads = os.environ["OMP_NUM_THREADS"]
    assert omp_threads == "1"
def test_call_matrix(shutdown_only):
    """Pass objects between tasks and actors in every combination.

    The matrix covers: producer is a task or an actor, consumer is a task or
    an actor, value is small or large, and the ref is passed directly or
    wrapped in a list.
    """
    ray.init(object_store_memory=1000 * 1024 * 1024)

    @ray.remote
    class Actor:
        def small_value(self):
            return 0

        def large_value(self):
            return np.zeros(10 * 1024 * 1024)

        def echo(self, x):
            if isinstance(x, list):
                x = ray.get(x[0])
            return x

    @ray.remote
    def small_value():
        return 0

    @ray.remote
    def large_value():
        return np.zeros(10 * 1024 * 1024)

    @ray.remote
    def echo(x):
        if isinstance(x, list):
            x = ray.get(x[0])
        return x

    def check(source_actor, dest_actor, is_large, out_of_band):
        print("CHECKING", "actor" if source_actor else "task", "to", "actor"
              if dest_actor else "task", "large_object"
              if is_large else "small_object", "out_of_band"
              if out_of_band else "in_band")

        # Produce the value, either from a fresh actor or from a task. The
        # actor handle is kept in a local for the duration of the check.
        if source_actor:
            producer = Actor.remote()
            produce = (producer.large_value
                       if is_large else producer.small_value)
        else:
            produce = large_value if is_large else small_value
        payload = produce.remote()

        # Out-of-band means the ref travels inside a list and the consumer
        # has to call ray.get on it itself.
        if out_of_band:
            payload = [payload]

        # Consume the value, either in a fresh actor or in a task.
        if dest_actor:
            consumer = Actor.remote()
            echoed = ray.get(consumer.echo.remote(payload))
        else:
            echoed = ray.get(echo.remote(payload))

        expected_type = np.ndarray if is_large else int
        assert isinstance(echoed, expected_type)

    flags = [False, True]
    for is_large in flags:
        for source_actor in flags:
            for dest_actor in flags:
                for out_of_band in flags:
                    check(source_actor, dest_actor, is_large, out_of_band)
def test_many_fractional_resources(shutdown_only):
    """Check fractional resource assignment and that resources are returned.

    Many tasks are submitted with random fractional CPU/GPU/custom requests;
    each compares the resources it was given with the expected amounts via
    ``dicts_equal`` (a helper defined outside this block). Afterwards the
    cluster must report all resources as available again within 10 seconds.
    """
    ray.init(num_cpus=2, num_gpus=2, resources={"Custom": 2})

    @ray.remote
    def g():
        return 1

    @ray.remote
    def f(block, accepted_resources):
        true_resources = {
            resource: value[0][1]
            for resource, value in ray.get_resource_ids().items()
        }
        if block:
            ray.get(g.remote())
        return dicts_equal(true_resources, accepted_resources)

    # Check that the resource are assigned correctly.
    result_ids = []
    for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)):
        resource_set = {"CPU": int(rand1 * 10000) / 10000}
        result_ids.append(f._remote([False, resource_set], num_cpus=rand1))

        resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000}
        result_ids.append(f._remote([False, resource_set], num_gpus=rand1))

        resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000}
        result_ids.append(
            f._remote([False, resource_set], resources={"Custom": rand1}))

        resource_set = {
            "CPU": int(rand1 * 10000) / 10000,
            "GPU": int(rand2 * 10000) / 10000,
            "Custom": int(rand3 * 10000) / 10000
        }
        # The combined request is submitted twice: once non-blocking and once
        # with the task blocking on a nested task.
        result_ids.append(
            f._remote(
                [False, resource_set],
                num_cpus=rand1,
                num_gpus=rand2,
                resources={"Custom": rand3}))
        result_ids.append(
            f._remote(
                [True, resource_set],
                num_cpus=rand1,
                num_gpus=rand2,
                resources={"Custom": rand3}))
    assert all(ray.get(result_ids))

    # Check that the available resources at the end are the same as the
    # beginning.
    expected_available = {"CPU": 2.0, "GPU": 2.0, "Custom": 2.0}
    stop_time = time.time() + 10
    correct_available_resources = False
    while time.time() < stop_time:
        # Evaluate one snapshot per iteration. Re-querying between the
        # membership check and the value read could raise KeyError or compare
        # values taken at different moments.
        available_resources = ray.available_resources()
        if all(
                available_resources.get(name) == amount
                for name, amount in expected_available.items()):
            correct_available_resources = True
            break
        # Brief pause so the poll does not spin at full speed.
        time.sleep(0.05)
    assert correct_available_resources, (
        "Did not get correct available resources.")