Beispiel #1
0
 def check_actor_restart():
     actors = list(ray.state.actors().values())
     assert len(actors) == 1
     print(actors)
     return (actors[0]["State"] == convert_actor_state(
         gcs_utils.ActorTableData.ALIVE) and actors[0]["NumRestarts"] == 1)
def _all_actors_dead():
    return all(
        actor["State"] == convert_actor_state(gcs_utils.ActorTableData.DEAD)
        for actor in list(ray.state.actors().values()))
Beispiel #3
0
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{
            "CPU": 2
        }, {
            "CPU": 2
        }],
                                      strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any of placement group, it should return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(placement_group=pg,
                          placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node.
        # (why? The placement group has STRICT_PACK strategy).
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node id should be identical, set should be equal to 1.
        assert len(node_id_set) == 1

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture the current tasks
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are not scheduled on the same node.
        # It is because the child tasks are not scheduled on the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure when None is specified, actors are not scheduled
        # on the same placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure all the actors are not scheduled on the same node.
        # It is because the child tasks are not scheduled on the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2