    def set_count(self, count):
        _internal_kv_put("count", count, True)

    # Verify we can get the object successfully.
    ra = RestartableActor.remote()
    ray.get(ra.f.remote())


@pytest.mark.parametrize(
    "ray_start_regular", [{
        "num_cpus": 2,
        "resources": {
            "a": 1
        }
    }],
    indirect=True)
@pytest.mark.skipif(new_scheduler_enabled(), reason="todo hangs")
def test_pending_actor_removed_by_owner(ray_start_regular):
    # Verify when an owner of pending actors is killed, the actor resources
    # are correctly returned.
    @ray.remote(num_cpus=1, resources={"a": 1})
    class A:
        def __init__(self):
            self.actors = []

        def create_actors(self):
            self.actors = [B.remote() for _ in range(2)]

    @ray.remote(resources={"a": 1})
    class B:
        def ping(self):
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2, fail=True)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})


@pytest.mark.skipif(new_scheduler_enabled(), reason="dynamic res todo")
def test_actor_creation_task(ray_start_regular):
    @ray.remote
    def large_object():
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote(resources={"init": 1})
    class Actor:
        def __init__(self, dependency):
            return

        def ping(self):
            return

    a = Actor.remote(large_object.remote())
    new_scheduler_enabled,
)


@ray.remote
class Increase:
    def method(self, x):
        return x + 2


@ray.remote
def increase(x):
    return x + 1


@pytest.mark.skipif(new_scheduler_enabled(), reason="broken")
@pytest.mark.parametrize(
    "ray_start_regular", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_gcs_server_restart(ray_start_regular):
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    result = ray.get(actor1.method.remote(7))
    assert result == 9
    # Run a chain of tasks whose length exceeds `num_cpus`, but whose CPU
    # resource requirement still stays within `num_cpus`.
    obj = foo.remote(4)
    wait_for_condition(lambda: len(get_workers()) == 4)
    ray.get(obj)
    # After the tasks finish, some workers are killed to keep the total
    # number of workers <= num_cpus.
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)
    # The two remaining workers stay alive forever.
    assert len(get_workers()) == 2


@pytest.mark.skipif(new_scheduler_enabled(), reason="fails")
def test_worker_capping_fifo(shutdown_only):
    # Start 2 initial workers by setting num_cpus to 2.
    info = ray.init(num_cpus=2)
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)

    @ray.remote
    def getpid():
        return os.getpid()

    worker1, worker2 = get_workers()
    if worker1.pid == ray.get(getpid.remote()):
        worker1, worker2 = [worker2, worker1]
print("Counts are {}.".format(counts)) if (len(names) == num_nodes and all(count >= minimum_count for count in counts)): break attempts += 1 assert attempts < num_attempts # Make sure we can get the results of a bunch of tasks. results = [] for _ in range(1000): index = np.random.randint(num_actors) results.append(actors[index].get_location.remote()) ray.get(results) @pytest.mark.skipif(new_scheduler_enabled(), reason="multi node broken") def test_actor_lifetime_load_balancing(ray_start_cluster): cluster = ray_start_cluster cluster.add_node(num_cpus=0) num_nodes = 3 for i in range(num_nodes): cluster.add_node(num_cpus=1) ray.init(address=cluster.address) @ray.remote(num_cpus=1) class Actor: def __init__(self): pass def ping(self): return
    def f():
        time.sleep(0.01)
        return ray.worker.global_worker.node.unique_id

    def local():
        return ray.get(f.remote()) == ray.worker.global_worker.node.unique_id

    # Wait for a worker to get started.
    wait_for_condition(local)

    # Check that we are scheduling locally while there are resources
    # available.
    for i in range(20):
        assert local()


@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often")
def test_load_balancing_with_dependencies(ray_start_cluster):
    # This test ensures that tasks are being assigned to all raylets in a
    # roughly equal manner even when the tasks have dependencies.
    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
        time.sleep(0.010)
        return ray.worker.global_worker.node.unique_id

    # This object will be local to one of the raylets. Make sure
        replay_buffer.append(ref)
        solution_buffer.append(arr)

    print("-----------------------------------")
    # randomly sample objects
    for _ in range(1000):
        index = random.choice(list(range(buffer_length)))
        ref = replay_buffer[index]
        solution = solution_buffer[index]
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, solution)


@pytest.mark.skipif(
    platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
def test_spill_during_get(object_spilling_config, shutdown_only):
    ray.init(
        num_cpus=4,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled": True,
            "object_store_full_initial_delay_ms": 100,
            # NOTE(swang): Use infinite retries because the OOM timer can
            # still get accidentally triggered when objects are released
            # too slowly (see github.com/ray-project/ray/issues/12040).
            "object_store_full_max_retries": -1,
            "max_io_workers": 1,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
        },
    new_scheduler_enabled,
)


@ray.remote
class Increase:
    def method(self, x):
        return x + 2


@ray.remote
def increase(x):
    return x + 1


@pytest.mark.skipif(new_scheduler_enabled(), reason="notimpl")
@pytest.mark.parametrize(
    "ray_start_regular", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_gcs_server_restart(ray_start_regular):
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    actor2 = Increase.remote()
        return 1

    ray.get(f.remote())

    # We should be able to create an actor that requires 0 CPU resources.
    @ray.remote(num_cpus=0)
    class Actor:
        def method(self):
            pass

    a = Actor.remote()
    x = a.method.remote()
    ray.get(x)


@pytest.mark.skipif(new_scheduler_enabled(), reason="zero cpu handling")
def test_zero_cpus_actor(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=0)
    valid_node = cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return ray.worker.global_worker.node.unique_id

    # Make sure tasks and actors run on the remote raylet.
    a = Foo.remote()
    assert valid_node.unique_id == ray.get(a.method.remote())
        if demand.shape == one_cpu_shape:
            one_cpu_found = True
    assert one_cpu_found

    # Check that we differentiate between infeasible and ready tasks.
    for demand in checker.report:
        if resource2 in demand.shape:
            assert demand.num_infeasible_requests_queued > 0
            assert demand.num_ready_requests_queued == 0
        else:
            assert demand.num_ready_requests_queued > 0
            assert demand.num_infeasible_requests_queued == 0

    global_state_accessor.disconnect()


@pytest.mark.skipif(
    new_scheduler_enabled(), reason="requires placement groups")
def test_placement_group_load_report(ray_start_cluster):
    cluster = ray_start_cluster
    # Add a head node that doesn't have GPU resources.
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    global_state_accessor = GlobalStateAccessor(
        cluster.address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    class PgLoadChecker:
        def nothing_is_ready(self):
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False