@ray.remote
class Increase:
    """Ray actor exposing a single method that adds 2 to its argument."""

    def method(self, x):
        """Return ``x + 2``."""
        return x + 2


@ray.remote
def increase(x):
    """Ray remote function that returns ``x + 1``."""
    return x + 1


# indirect=True hands the generated system config to the
# ``ray_start_regular`` fixture instead of passing it to the test directly.
@pytest.mark.parametrize(
    "ray_start_regular", [
        generate_system_config_map(
            num_heartbeats_timeout=2, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_gcs_server_restart(ray_start_regular):
    """Exercise actor creation and calls around a GCS server kill/start cycle.

    The visible part calls an actor created before the cycle, kills and
    starts the GCS server, then creates and calls a second actor and submits
    a remote task.

    NOTE(review): the last visible statement assigns ``result`` without
    checking it, so the body appears to continue beyond this excerpt.
    """
    # An actor created before the kill/start cycle answers a call.
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    # Kill the GCS server and start it again through the global node object.
    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # A new actor can be created and called after the cycle.
    actor2 = Increase.remote()
    result = ray.get(actor2.method.remote(2))
    assert result == 4

    # Submit a plain remote task as well; the check on its result is not
    # part of the visible source.
    result = ray.get(increase.remote(1))
# NOTE(review): the statements down to the decorator below are the tail of a
# definition whose header lies outside the visible source; their original
# nesting level and the origin of ``a``, ``num_nodes`` and
# ``num_cpu_per_node`` cannot be determined from here.
ray.kill(a, no_restart=False)
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))

# The detached actor a should've been restarted.
# Recreate a placement group.
ray.get(a.create_pg.remote())
wait_for_condition(lambda: assert_num_cpus(num_nodes))

# Kill it again and make sure the placement group
# that is created is deleted again.
ray.kill(a, no_restart=False)
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))


# indirect=True hands the generated system config to the
# ``ray_start_cluster_head_with_external_redis`` fixture.
@pytest.mark.parametrize(
    "ray_start_cluster_head_with_external_redis",
    [
        generate_system_config_map(num_heartbeats_timeout=10, gcs_rpc_server_reconnect_timeout_s=60)
    ],
    indirect=True,
)
def test_create_placement_group_after_gcs_server_restart(
    ray_start_cluster_head_with_external_redis,
):
    """Create a placement group on a cluster with two added 2-CPU nodes.

    The visible part adds two nodes, waits for them, creates a two-bundle
    placement group and asserts that its table entry reports "CREATED".

    NOTE(review): no GCS server restart appears in the visible part, so the
    body presumably continues beyond this excerpt.
    """
    cluster = ray_start_cluster_head_with_external_redis
    cluster.add_node(num_cpus=2)
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Create placement group 1 successfully.
    placement_group1 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
    # Block (up to 10 seconds) until the placement group reports ready.
    ray.get(placement_group1.ready(), timeout=10)
    table = ray.util.placement_group_table(placement_group1)
    assert table["state"] == "CREATED"
# NOTE(review): the three statements below are the tail of a definition whose
# header lies outside the visible source; their original nesting level cannot
# be determined from here.
g.remove_node(node2)
g.remove_node(node)
assert not any(n.any_processes_alive() for n in [node, node2])


def test_shutdown():
    """Check that no node reports live processes after ``Cluster.shutdown()``.

    Builds a cluster without an automatically initialized head, adds two
    nodes, shuts the cluster down and asserts that neither node has any
    process alive.
    """
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])


# indirect=True hands the generated system config to the
# ``ray_start_cluster_head`` fixture.
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
        generate_system_config_map(
            num_heartbeats_timeout=3, object_timeout_milliseconds=12345)
    ],
    indirect=True)
def test_system_config(ray_start_cluster_head):
    """Checks that the internal configuration setting works.

    We set the cluster to timeout nodes after 2 seconds of no heartbeats. We
    then remove a node, wait for 1 second to check that the cluster is out
    of sync, then wait another 2 seconds (giving 1 second of leeway) to check
    that the client has timed out. We also check to see if the config is set.
    """
    # NOTE(review): the fixture config sets num_heartbeats_timeout=3; the
    # "2 seconds" figure in the docstring could not be confirmed from the
    # visible source -- verify the timings.
    cluster = ray_start_cluster_head
    worker = cluster.add_node()
    cluster.wait_for_nodes()

    # NOTE(review): the definition this decorator applies to lies beyond the
    # visible source; it is presumably nested inside this test -- confirm.
    @ray.remote
# NOTE(review): the two statements below are the tail of a definition whose
# header lies outside the visible source; their original nesting level and
# the definitions of ``Actor`` and ``RetryableTask`` are not visible here.
remote_actor = Actor.remote()
assert ray.get(RetryableTask.remote(remote_actor)) == 3


@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.")
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
# this test. Because if this value is too small, spurious task reconstruction
# may happen and cause the test failure. If the value is too large, this test
# could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
    "ray_start_cluster_head",
    [
        generate_system_config_map(
            object_timeout_milliseconds=1000, num_heartbeats_timeout=10
        )
    ],
    indirect=True,
)
def test_multiple_actor_restart(ray_start_cluster_head):
    """Set up a multi-node cluster for an actor restart test.

    The visible part defines the sizing constants and adds ``num_nodes``
    worker nodes with 3 CPUs each to the cluster provided by the fixture.

    NOTE(review): ``num_actors_at_a_time`` and
    ``num_function_calls_at_a_time`` are not used in the visible part, so the
    body appears to continue beyond this excerpt.
    """
    cluster = ray_start_cluster_head
    # This test can be made more stressful by increasing the numbers below.
    # The total number of actors created will be
    # num_actors_at_a_time * num_nodes.
    num_nodes = 5
    num_actors_at_a_time = 3
    num_function_calls_at_a_time = 10

    # Keep the node handles returned by add_node, one per worker node.
    worker_nodes = [cluster.add_node(num_cpus=3) for _ in range(num_nodes)]
@ray.remote
class Increase:
    """Ray actor exposing a single method that adds 2 to its argument."""

    def method(self, x):
        """Return ``x + 2``."""
        return x + 2


@ray.remote
def increase(x):
    """Ray remote function that returns ``x + 1``."""
    return x + 1


# indirect=True hands the generated system config to the
# ``ray_start_regular_with_external_redis`` fixture instead of passing it to
# the test directly.
@pytest.mark.parametrize(
    "ray_start_regular_with_external_redis",
    [
        generate_system_config_map(num_heartbeats_timeout=20, gcs_rpc_server_reconnect_timeout_s=60)
    ],
    indirect=True,
)
def test_gcs_server_restart(ray_start_regular_with_external_redis):
    """Exercise actor creation and calls around a GCS server kill/start cycle.

    The visible part calls an actor created before the cycle, kills and
    starts the GCS server, then creates and calls a second actor.

    NOTE(review): ``increase`` is not used in the visible part; the body may
    continue beyond this excerpt -- confirm against the full file.
    """
    # An actor created before the kill/start cycle answers a call.
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    # Kill the GCS server and start it again through the global node object.
    ray._private.worker._global_node.kill_gcs_server()
    ray._private.worker._global_node.start_gcs_server()

    # A new actor can be created and called after the cycle.
    actor2 = Increase.remote()
    result = ray.get(actor2.method.remote(2))
    assert result == 4