Example #1
import numpy as np
import pytest
import ray


def test_actor_holding_serialized_reference(one_worker_100MiB, use_ray_put,
                                            failure):
    @ray.remote
    class GreedyActor:
        def __init__(self):
            pass

        def set_ref1(self, ref):
            self.ref1 = ref

        def add_ref2(self, new_ref):
            self.ref2 = new_ref

        def delete_ref1(self):
            self.ref1 = None

        def delete_ref2(self):
            self.ref2 = None

    # Test that the reference held by the actor isn't evicted.
    array_oid = put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    actor = GreedyActor.remote()
    actor.set_ref1.remote([array_oid])

    # Test that giving the same actor a duplicate reference works.
    ray.get(actor.add_ref2.remote([array_oid]))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the remote references still pin the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Test that removing only the first reference doesn't unpin the object.
    ray.get(actor.delete_ref1.remote())
    _fill_object_store_and_get(array_oid_bytes)

    if failure:
        # Test that the actor exiting stops the reference from being pinned.
        # Kill the actor and wait for the actor to exit.
        kill_actor_and_wait_for_failure(actor)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.delete_ref1.remote())
    else:
        # Test that deleting the second reference stops it from being pinned.
        ray.get(actor.delete_ref2.remote())
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
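The helpers put_object and _fill_object_store_and_get are defined elsewhere in Ray's test suite and not shown in this example. Below is a minimal sketch of what they might look like, assuming the one_worker_100MiB fixture provides a 100 MiB object store and that a wait_for_condition polling helper is available; the import path, signatures, and defaults are assumptions, not Ray's actual utilities.

import numpy as np

import ray
from ray.test_utils import wait_for_condition  # import path varies by version


@ray.remote
def _identity(obj):
    # Hypothetical task: return the argument so the object is created by a
    # task return instead of an explicit ray.put().
    return obj


def put_object(obj, use_ray_put):
    # Exercise both object-creation paths: ray.put() and task returns.
    return ray.put(obj) if use_ray_put else _identity.remote(obj)


def _fill_object_store_and_get(oid, succeed=True, object_MiB=20,
                               num_objects=5):
    # Fill the 100 MiB store with 5 x 20 MiB arrays, forcing eviction of
    # any object that is not pinned by a reference.
    for _ in range(num_objects):
        ray.put(np.zeros(object_MiB * 1024 * 1024, dtype=np.uint8))

    if type(oid) is bytes:
        oid = ray.ObjectRef(oid)

    if succeed:
        # A pinned object must still be retrievable.
        wait_for_condition(
            lambda: ray.worker.global_worker.core_worker.object_exists(oid))
    else:
        # An unpinned object should have been evicted by now.
        wait_for_condition(
            lambda: not ray.worker.global_worker.core_worker.object_exists(oid))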
Example #2
import time

import pytest
import ray


def test_kill(ray_start_regular_shared):
    @ray.remote
    class Actor:
        def hang(self):
            while True:
                time.sleep(1)

    actor = Actor.remote()
    result = actor.hang.remote()
    ready, _ = ray.wait([result], timeout=0.5)
    assert len(ready) == 0
    kill_actor_and_wait_for_failure(actor)

    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(result)

    with pytest.raises(ValueError):
        ray.kill("not_an_actor_handle")
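kill_actor_and_wait_for_failure is another helper from Ray's test utilities. A plausible sketch is shown below, reusing the ray.state.actors() table that the placement group examples that follow also query; the polling strategy, signature, and import paths are assumptions rather than the real implementation.

import ray
# These helpers ship with Ray's test utilities; their import paths vary
# across Ray versions (e.g. ray.test_utils vs. ray._private.test_utils).
from ray.test_utils import convert_actor_state, wait_for_condition
from ray import gcs_utils


def kill_actor_and_wait_for_failure(actor, timeout=10):
    # Kill the actor, then block until the GCS actor table stops reporting
    # it as ALIVE, so the RayActorError assertions that follow cannot race
    # the asynchronous kill.
    actor_id = actor._actor_id.hex()
    ray.kill(actor)

    def actor_not_alive():
        info = ray.state.actors().get(actor_id)
        return info is not None and info["State"] != convert_actor_state(
            gcs_utils.ActorTableData.ALIVE)

    wait_for_condition(actor_not_alive, timeout=timeout)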
Example #3
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [{"CPU": 2}, {"CPU": 2}], strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is called while the current
        # worker/driver doesn't belong to any placement group, it should
        # return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(placement_group=pg,
                          placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node,
        # because the placement group uses the STRICT_PACK strategy.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node IDs should be identical, the set should have size 1.
        assert len(node_id_set) == 1

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture child tasks.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure the actors are spread across nodes this time,
        # because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure that when placement_group=None is specified,
        # actors are not scheduled in the placement group even though
        # child-task capture is enabled.
        a = Actor.options(placement_group=pg,
                          placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure the actors are spread across nodes this time,
        # because the child actors explicitly opted out of the
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2
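This test is parametrized over Ray Client via connect_to_client_or_not, another test utility that is not shown here. A rough sketch of such a context manager follows, assuming the Ray Client API of this era (ray.util.connect / ray.util.disconnect) and a client server already listening on an assumed address; it is an illustration, not Ray's actual helper.

from contextlib import contextmanager

import ray


@contextmanager
def connect_to_client_or_not(connect_to_client):
    # Run the enclosed block over a Ray Client connection when requested,
    # so a single test body covers both the client and non-client paths.
    if connect_to_client:
        ray.util.connect("localhost:50051")  # assumed server address
        try:
            yield
        finally:
            ray.util.disconnect()
    else:
        yield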
Example #4
def test_detached_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)

    # Make sure the detached placement group stays alive after the job dies.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())

@ray.remote(num_cpus=1)
class Actor:
    def ready(self):
        return True

for bundle_index in range(2):
    actor = Actor.options(lifetime="detached", placement_group=pg,
                placement_group_bundle_index=bundle_index).remote()
    ray.get(actor.ready.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_alive_num_pg(expected_num_pg):
        alive_num_pg = 0
        pg_table = ray.util.placement_group_table()
        for _, placement_group_info in pg_table.items():
            if placement_group_info["state"] == "CREATED":
                alive_num_pg += 1
        return alive_num_pg == expected_num_pg

    def assert_alive_num_actor(expected_num_actor):
        alive_num_actor = 0
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == gcs_utils.ActorTableData.ALIVE:
                alive_num_actor += 1
        return alive_num_actor == expected_num_actor

    wait_for_condition(is_job_done)

    assert assert_alive_num_pg(1)
    assert assert_alive_num_actor(2)

    # Make sure the detached placement group stays alive after its
    # creator (a detached actor) dies.
    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor_with_detached_pg(self):
            # Create placement group which is detached.
            pg = ray.util.placement_group(
                [{"CPU": 1} for _ in range(2)],
                strategy="STRICT_SPREAD",
                lifetime="detached",
                name="detached_pg")
            ray.get(pg.ready())
            # Schedule nested actor with the placement group.
            for bundle_index in range(2):
                actor = NestedActor.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index,
                    lifetime="detached").remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

    a = Actor.options(lifetime="detached").remote()
    ray.get(a.ready.remote())
    # 1 parent actor and 2 child actors.
    ray.get(a.schedule_nested_actor_with_detached_pg.remote())

    # Kill an actor and wait until it is killed.
    kill_actor_and_wait_for_failure(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # We should have 2 alive pgs and 4 alive actors.
    assert assert_alive_num_pg(2)
    assert assert_alive_num_actor(4)
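wait_for_condition and run_string_as_driver also come from Ray's test utilities. Minimal sketches of what they might do are given below; the signatures, defaults, and error handling are assumptions.

import subprocess
import sys
import time


def wait_for_condition(condition_predictor, timeout=10,
                       retry_interval_ms=100):
    # Poll a boolean predicate until it holds or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")


def run_string_as_driver(driver_code):
    # Run a Python snippet in a fresh interpreter so it behaves like an
    # independent Ray driver, and fail loudly if it exits with an error.
    result = subprocess.run([sys.executable, "-c", driver_code],
                            capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError("Driver failed:\n" + result.stderr)
    return result.stdout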