Example #1
def test_global_state_actor_table(ray_start_regular):
    @ray.remote
    class Actor:
        def ready(self):
            pass

    # actor table should be empty at first
    assert len(ray.actors()) == 0

    # actor table should contain only one entry
    a = Actor.remote()
    ray.get(a.ready.remote())
    assert len(ray.actors()) == 1

    # actor table should contain only this entry
    # even when the actor goes out of scope
    del a

    def get_state():
        return list(ray.actors().values())[0]["State"]

    dead_state = ray.gcs_utils.ActorTableData.DEAD
    for _ in range(10):
        if get_state() == dead_state:
            break
        else:
            time.sleep(0.5)
    assert get_state() == dead_state
Example #2
def test_global_state_api(shutdown_only):

    error_message = ("The ray global state API cannot be used "
                     "before ray.init has been called.")

    with pytest.raises(Exception, match=error_message):
        ray.objects()

    with pytest.raises(Exception, match=error_message):
        ray.actors()

    with pytest.raises(Exception, match=error_message):
        ray.nodes()

    with pytest.raises(Exception, match=error_message):
        ray.jobs()

    ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1})

    assert ray.cluster_resources()["CPU"] == 5
    assert ray.cluster_resources()["GPU"] == 3
    assert ray.cluster_resources()["CustomResource"] == 1

    assert ray.objects() == {}

    job_id = ray.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))

    client_table = ray.nodes()
    node_ip_address = ray.worker.global_worker.node_ip_address

    assert len(client_table) == 1
    assert client_table[0]["NodeManagerAddress"] == node_ip_address

    @ray.remote
    class Actor:
        def __init__(self):
            pass

    _ = Actor.remote()  # noqa: F841
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["NodeManagerAddress"] == node_ip_address
Example #3
def wait_until_actor_dead(handle):
    actor_status = ray.actors(actor_id=handle._actor_id.hex())
    max_wait_time = 10
    wait_time = 0
    while actor_status["State"] != ray.gcs_utils.ActorTableData.DEAD:
        actor_status = ray.actors(actor_id=handle._actor_id.hex())
        time.sleep(1.0)
        wait_time += 1
        if wait_time >= max_wait_time:
            assert None, (
                "It took too much time to kill an actor: {}".format(
                    handle._actor_id))
Example #4
def kill_actor_and_wait_for_failure(actor, timeout=10, retry_interval_ms=100):
    actor_id = actor._actor_id.hex()
    current_num_restarts = ray.actors(actor_id)["NumRestarts"]
    ray.kill(actor)
    start = time.time()
    while time.time() - start <= timeout:
        actor_status = ray.actors(actor_id)
        if actor_status["State"] == ray.gcs_utils.ActorTableData.DEAD \
                or actor_status["NumRestarts"] > current_num_restarts:
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError(
        "It took too much time to kill an actor: {}".format(actor_id))
Example #5
def test_http_head_only():
    cluster = Cluster()
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2

    client = serve.start(http_options={
        "port": new_port(),
        "location": "HeadOnly"
    })

    # Only the controller and head node actor should be started
    assert len(ray.actors()) == 2

    # They should all be placed on the head node
    cpu_per_nodes = {
        r["CPU"]
        for r in ray.state.state._available_resources_per_node().values()
    }
    assert cpu_per_nodes == {2, 4}

    client.shutdown()
    ray.shutdown()
    cluster.shutdown()
Example #6
def wait_for_num_actors(num_actors, timeout=10):
    start_time = time.time()
    while time.time() - start_time < timeout:
        if len(ray.actors()) >= num_actors:
            return
        time.sleep(0.1)
    raise RayTestTimeoutException("Timed out while waiting for global state.")
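A sketch of the typical call pattern, assuming a trivial @ray.remote Actor class like the one in Example #2; actor registration in the GCS is asynchronous, so tests poll rather than assert immediately:

_ = Actor.remote()
wait_for_num_actors(1)
assert len(ray.actors()) >= 1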
Example #7
def create_and_kill_actor(actor_name):
    # Make sure the same name is creatable after killing the actor.
    detached_actor = DetachedActor.options(name=actor_name).remote()
    # Wait for detached actor creation.
    assert ray.get(detached_actor.ping.remote()) == "pong"
    ray.kill(detached_actor)
    # Wait until the actor dies.
    actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
    max_wait_time = 10
    wait_time = 0
    while actor_status["State"] != 3:  # 3 == ActorTableData.DEAD
        actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
        time.sleep(1.0)
        wait_time += 1
        if wait_time >= max_wait_time:
            assert None, (
                "It took too much time to kill an actor: {}".format(
                    detached_actor._actor_id))
Example #8
def test_placement_group_strict_spread(ray_start_cluster):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    placement_group = ray.util.placement_group(
        name="name",
        strategy="STRICT_SPREAD",
        bundles=[{
            "CPU": 2
        }, {
            "CPU": 2
        }, {
            "CPU": 2
        }])
    ray.get(placement_group.ready())
    actor_1 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0).remote()
    actor_2 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1).remote()
    actor_3 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=2).remote()

    ray.get(actor_1.value.remote())
    ray.get(actor_2.value.remote())
    ray.get(actor_3.value.remote())

    # Get all actors.
    actor_infos = ray.actors()

    # Make sure the three actors are placed on separate nodes.
    actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
    actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
    actor_info_3 = actor_infos.get(actor_3._actor_id.hex())

    assert actor_info_1 and actor_info_2 and actor_info_3

    node_of_actor_1 = actor_info_1["Address"]["NodeID"]
    node_of_actor_2 = actor_info_2["Address"]["NodeID"]
    node_of_actor_3 = actor_info_3["Address"]["NodeID"]
    assert node_of_actor_1 != node_of_actor_2
    assert node_of_actor_1 != node_of_actor_3
    assert node_of_actor_2 != node_of_actor_3
Example #9
def wait_for_num_actors(num_actors, state=None, timeout=10):
    start_time = time.time()
    while time.time() - start_time < timeout:
        if len([
                actor for actor in ray.actors().values()
                if state is None or actor["State"] == state
        ]) >= num_actors:
            return
        time.sleep(0.1)
    raise RayTestTimeoutException("Timed out while waiting for global state.")
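Unlike the variant in Example #6, this one can also filter by state; for instance, to wait until one actor is ALIVE rather than merely registered (a sketch using the same gcs_utils constant the tests above rely on):

wait_for_num_actors(1, state=ray.gcs_utils.ActorTableData.ALIVE)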
Example #10
def test_capture_child_tasks(ray_start_cluster):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    pg = ray.util.placement_group([{
        "CPU": 2
    }, {
        "CPU": 2
    }],
                                  strategy="STRICT_PACK")
    ray.get(pg.ready(), timeout=5)

    # If get_current_placement_group is used when the current worker/driver
    # doesn't belong to any placement group, it should return None.
    assert get_current_placement_group() is None

    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor(self):
            actor = NestedActor.options(
                placement_group=get_current_placement_group()).remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure all the actors are scheduled on the same node,
    # because the placement group uses the STRICT_PACK strategy.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    # Since all node IDs should be identical, the set should have size 1.
    assert len(node_id_set) == 1
Example #11
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def check_num_actor():
        assert len(ray.actors()) == num_workers

    model.on_epoch_end = check_num_actor
    plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[plugin])
    trainer.fit(model)
    assert all(actor["State"] == ray.gcs_utils.ActorTableData.DEAD
               for actor in list(ray.actors().values()))
Example #12
def test_global_state_api(shutdown_only):

    ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1})

    assert ray.cluster_resources()["CPU"] == 5
    assert ray.cluster_resources()["GPU"] == 3
    assert ray.cluster_resources()["CustomResource"] == 1

    # A driver/worker creates a temporary object during startup. Although the
    # temporary object is freed immediately, in rare cases we can still find
    # the object ref in the GCS because the Raylet removes it from the GCS
    # asynchronously. Since we can't control when workers create these
    # temporary objects, we can't assert that `ray.objects()` returns an
    # empty dict; we just make sure the call succeeds.
    assert len(ray.objects()) >= 0

    job_id = ray.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))

    client_table = ray.nodes()
    node_ip_address = ray.worker.global_worker.node_ip_address

    assert len(client_table) == 1
    assert client_table[0]["NodeManagerAddress"] == node_ip_address

    @ray.remote
    class Actor:
        def __init__(self):
            pass

    _ = Actor.options(name="test_actor").remote()  # noqa: F841
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert actor_info["Name"] == "test_actor"
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["DriverIPAddress"] == node_ip_address
Example #13
def test_placement_group_pack(ray_start_cluster):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    placement_group = ray.util.placement_group(
        name="name",
        strategy="PACK",
        bundles=[
            {
                "CPU": 2,
                "GPU": 0  # Test 0 resource spec doesn't break tests.
            },
            {
                "CPU": 2
            }
        ])
    ray.get(placement_group.ready())
    actor_1 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0).remote()
    actor_2 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1).remote()

    ray.get(actor_1.value.remote())
    ray.get(actor_2.value.remote())

    # Get all actors.
    actor_infos = ray.actors()

    # Make sure both actors are colocated on the same node.
    actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
    actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

    assert actor_info_1 and actor_info_2

    node_of_actor_1 = actor_info_1["Address"]["NodeID"]
    node_of_actor_2 = actor_info_2["Address"]["NodeID"]
    assert node_of_actor_1 == node_of_actor_2
Example #14
def _mock_train(*args, _training_state, **kwargs):
    try:
        return _train(*args, _training_state=_training_state, **kwargs)
    finally:
        assert len(_training_state.actors) == num_actors
        if not any(a is None for a in _training_state.actors):
            actor_infos = ray.actors()
            actor_nodes = []
            for a in _training_state.actors:
                actor_info = actor_infos.get(a._actor_id.hex())
                actor_node = actor_info["Address"]["NodeID"]
                actor_nodes.append(actor_node)
            assert actor_nodes[0] == actor_nodes[1]
Example #15
def test_no_http(ray_shutdown):
    # The following should have the same effect.
    options = [
        {
            "http_host": None
        },
        {
            "http_options": {
                "host": None
            }
        },
        {
            "http_options": {
                "location": None
            }
        },
        {
            "http_options": {
                "location": "NoServer"
            }
        },
    ]

    ray.init(num_cpus=16)
    for i, option in enumerate(options):
        print(f"[{i+1}/{len(options)}] Running with {option}")
        serve.start(**option)

        # Only the controller actor should exist
        live_actors = [
            actor for actor in ray.actors().values()
            if actor["State"] == ray.gcs_utils.ActorTableData.ALIVE
        ]
        assert len(live_actors) == 1
        controller = serve.api._global_client._controller
        assert len(ray.get(controller.get_http_proxies.remote())) == 0

        # Test that the handle still works.
        def hello(*args):
            return "hello"

        serve.create_backend("backend", hello)
        serve.create_endpoint("endpoint", backend="backend")

        assert ray.get(serve.get_handle("endpoint").remote()) == "hello"
        serve.shutdown()
Example #16
def test_placement_group_strict_pack(ray_start_cluster):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    placement_group_id = ray.experimental.placement_group(
        name="name", strategy="STRICT_PACK", bundles=[{
            "CPU": 2
        }, {
            "CPU": 2
        }])
    actor_1 = Actor.options(
        placement_group_id=placement_group_id,
        placement_group_bundle_index=0).remote()
    actor_2 = Actor.options(
        placement_group_id=placement_group_id,
        placement_group_bundle_index=1).remote()

    print(ray.get(actor_1.value.remote()))
    print(ray.get(actor_2.value.remote()))

    # Get all actors.
    actor_infos = ray.actors()

    # Make sure both actors are colocated on the same node.
    actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
    actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

    assert actor_info_1 and actor_info_2

    node_of_actor_1 = actor_info_1["Address"]["NodeID"]
    node_of_actor_2 = actor_info_2["Address"]["NodeID"]
    assert node_of_actor_1 == node_of_actor_2
Example #17
def _get_actor(name):
    if ray._raylet.gcs_actor_service_enabled():
        worker = ray.worker.global_worker
        handle = worker.core_worker.get_named_actor_handle(name)
    else:
        actor_name = _calculate_key(name)
        pickled_state = _internal_kv_get(actor_name)
        if pickled_state is None:
            raise ValueError(
                "The actor with name={} doesn't exist".format(name))
        handle = pickle.loads(pickled_state)
        # If the actor state is dead, this name is reusable.
        # We don't delete the name entry from the key-value store when
        # the actor is killed, because ray.kill is asynchronous and
        # doing so could cause worker leaks.
        actor_info = ray.actors(actor_id=handle._actor_id.hex())
        actor_state = actor_info.get("State", None)
        if actor_state and actor_state == ActorTableData.DEAD:
            raise ValueError("The actor with name={} is dead.".format(name))
    return handle
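A minimal call sketch (the actor name here is hypothetical; the helper raises ValueError when no actor has that name or when the named actor is already dead):

handle = _get_actor("my_named_actor")  # hypothetical name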
Example #18
def test_job_gc(call_ray_start):
    address = call_ray_start

    ray.init(address=address)
    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

_ = Actor.remote()
""".format(address)

    p = run_string_as_driver_nonblocking(driver)
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    job_table = ray.jobs()
    assert len(job_table) == 2  # dash

    # Kill the driver process.
    p.kill()
    p.wait()

    def actor_finish():
        actor_table = ray.actors()
        return len(actor_table) == 0

    wait_for_condition(actor_finish)
Example #19
def test_job_gc_with_detached_actor(call_ray_start):
    address = call_ray_start

    ray.init(address=address)
    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

    def value(self):
        return 1

_ = Actor.options(lifetime="detached", name="DetachedActor").remote()
# Make sure the actor is created before the driver exits.
ray.get(_.value.remote())
""".format(address)

    p = run_string_as_driver_nonblocking(driver)
    # Wait for actor to be created
    wait_for_num_actors(1, ray.gcs_utils.ActorTableData.ALIVE)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    job_table = ray.jobs()
    assert len(job_table) == 2  # dash

    # Kill the driver process.
    p.kill()
    p.wait()

    detached_actor = ray.get_actor("DetachedActor")
    assert ray.get(detached_actor.value.remote()) == 1
Example #20
def test_job_gc_with_detached_actor(call_ray_start):
    address = call_ray_start

    ray.init(address=address)
    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

    def value(self):
        return 1

_ = Actor.options(name="DetachedActor").remote()
""".format(address)

    p = run_string_as_driver_nonblocking(driver)
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    job_table = ray.jobs()
    assert len(job_table) == 2

    # Kill the driver process.
    p.kill()
    p.wait()

    detached_actor = ray.get_actor("DetachedActor")
    assert ray.get(detached_actor.value.remote()) == 1
Example #21
def test_no_http():
    # The following should have the same effect.
    options = [
        {
            "http_host": None
        },
        {
            "http_options": {
                "host": None
            }
        },
        {
            "http_options": {
                "location": None
            }
        },
        {
            "http_options": {
                "location": "NoServer"
            }
        },
    ]

    ray.init()
    for option in options:
        client = serve.start(**option)

        # Only the controller actor should exist
        live_actors = [
            actor for actor in ray.actors().values()
            if actor["State"] == ray.gcs_utils.ActorTableData.ALIVE
        ]
        assert len(live_actors) == 1

        client.shutdown()
    ray.shutdown()
Example #22
def test_global_state_actor_entry(ray_start_regular):
    @ray.remote
    class Actor:
        def ready(self):
            pass

    # actor table should be empty at first
    assert len(ray.actors()) == 0

    a = Actor.remote()
    b = Actor.remote()
    ray.get(a.ready.remote())
    ray.get(b.ready.remote())
    assert len(ray.actors()) == 2
    a_actor_id = a._actor_id.hex()
    b_actor_id = b._actor_id.hex()
    assert ray.actors(actor_id=a_actor_id)["ActorID"] == a_actor_id
    # In this Ray version, state 1 == ActorTableData.ALIVE.
    assert ray.actors(actor_id=a_actor_id)["State"] == 1
    assert ray.actors(actor_id=b_actor_id)["ActorID"] == b_actor_id
    assert ray.actors(actor_id=b_actor_id)["State"] == 1
Example #23
def _all_actors_dead():
    return all(actor["State"] == ray.gcs_utils.ActorTableData.DEAD
               for actor in list(ray.actors().values()))
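This predicate is written to be polled; a sketch, assuming the wait_for_condition helper from Ray's test utilities that also appears in Example #18:

ray.kill(actor)  # 'actor' is a hypothetical handle
wait_for_condition(_all_actors_dead)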
Example #24
def get_node_id_for_actor(actor_handle):
    """Given an actor handle, return the node id it's placed on."""

    return ray.actors()[actor_handle._actor_id.hex()]["Address"]["NodeID"]
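A sketch of the colocation check this helper enables (a and b are hypothetical actor handles):

assert get_node_id_for_actor(a) == get_node_id_for_actor(b)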
Example #25
def test_capture_child_actors(ray_start_cluster):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    pg = ray.util.placement_group(
        [{
            "CPU": 2
        }, {
            "CPU": 2
        }], strategy="STRICT_PACK")
    ray.get(pg.ready())

    # If get_current_placement_group is used when the current worker/driver
    # doesn't belong to any placement group, it should return None.
    assert get_current_placement_group() is None

    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor(self):
            # Make sure we can capture the current placement group.
            assert get_current_placement_group() is not None
            # Actors should be implicitly captured.
            actor = NestedActor.remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

        def schedule_nested_actor_outside_pg(self):
            # Don't use placement group.
            actor = NestedActor.options(placement_group=None).remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure all the actors are scheduled on the same node,
    # because the placement group uses the STRICT_PACK strategy.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    # Since all node IDs should be identical, the set should have size 1.
    assert len(node_id_set) == 1

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Now create an actor, but do not capture the current tasks
    a = Actor.options(
        placement_group=pg,
        placement_group_capture_child_tasks=False).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because the child actors are not captured by the placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Lastly, make sure that when placement_group=None is specified,
    # nested actors are not scheduled in the placement group.
    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor_outside_pg.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because these child actors are explicitly scheduled outside
    # the placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2
Example #26
def check_num_actor():
    assert len(ray.actors()) == num_workers
Example #27
def assert_alive_num_actor(expected_num_actor):
    alive_num_actor = 0
    for actor_info in ray.actors().values():
        if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
            alive_num_actor += 1
    return alive_num_actor == expected_num_actor
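Despite the assert_ prefix, this helper returns a boolean, so it is meant to be polled; a sketch, again assuming Ray's wait_for_condition test utility:

wait_for_condition(lambda: assert_alive_num_actor(2))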
Example #28
    def run(self):
        p = self.redis_client.pubsub(ignore_subscribe_messages=True)

        p.psubscribe(self.redis_key)
        logger.info("NodeStats: subscribed to {}".format(self.redis_key))

        log_channel = ray.gcs_utils.LOG_FILE_CHANNEL
        p.subscribe(log_channel)
        logger.info("NodeStats: subscribed to {}".format(log_channel))

        error_channel = ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB")
        p.subscribe(error_channel)
        logger.info("NodeStats: subscribed to {}".format(error_channel))

        actor_channel = ray.gcs_utils.TablePubsub.Value("ACTOR_PUBSUB")
        p.subscribe(actor_channel)
        logger.info("NodeStats: subscribed to {}".format(actor_channel))

        current_actor_table = ray.actors()
        with self._node_stats_lock:
            for actor_data in current_actor_table.values():
                addr = (actor_data["Address"]["IPAddress"],
                        str(actor_data["Address"]["Port"]))
                owner_addr = (actor_data["OwnerAddress"]["IPAddress"],
                              str(actor_data["OwnerAddress"]["Port"]))
                self._addr_to_owner_addr[addr] = owner_addr
                self._addr_to_actor_id[addr] = actor_data["ActorID"]
                self._addr_to_extra_info_dict[addr] = {
                    "jobId": actor_data["JobID"],
                    "state": actor_data["State"],
                    "isDirectCall": actor_data["IsDirectCall"],
                    "timestamp": actor_data["Timestamp"]
                }

        for x in p.listen():
            try:
                with self._node_stats_lock:
                    channel = ray.utils.decode(x["channel"])
                    data = x["data"]
                    if channel == log_channel:
                        data = json.loads(ray.utils.decode(data))
                        ip = data["ip"]
                        pid = str(data["pid"])
                        self._logs[ip][pid].extend(data["lines"])
                    elif channel == str(error_channel):
                        gcs_entry = ray.gcs_utils.GcsEntry.FromString(data)
                        error_data = ray.gcs_utils.ErrorTableData.FromString(
                            gcs_entry.entries[0])
                        message = error_data.error_message
                        message = re.sub(r"\x1b\[\d+m", "", message)
                        match = re.search(r"\(pid=(\d+), ip=(.*?)\)", message)
                        if match:
                            pid = match.group(1)
                            ip = match.group(2)
                            self._errors[ip][pid].append({
                                "message":
                                message,
                                "timestamp":
                                error_data.timestamp,
                                "type":
                                error_data.type
                            })
                    elif channel == str(actor_channel):
                        gcs_entry = ray.gcs_utils.GcsEntry.FromString(data)
                        actor_data = ray.gcs_utils.ActorTableData.FromString(
                            gcs_entry.entries[0])
                        addr = (actor_data.address.ip_address,
                                str(actor_data.address.port))
                        owner_addr = (actor_data.owner_address.ip_address,
                                      str(actor_data.owner_address.port))
                        self._addr_to_owner_addr[addr] = owner_addr
                        self._addr_to_actor_id[addr] = ray.utils.binary_to_hex(
                            actor_data.actor_id)
                        self._addr_to_extra_info_dict[addr] = {
                            "jobId":
                            ray.utils.binary_to_hex(actor_data.job_id),
                            "state": actor_data.state,
                            "isDirectCall": actor_data.is_direct_call,
                            "timestamp": actor_data.timestamp
                        }
                    else:
                        data = json.loads(ray.utils.decode(data))
                        self._node_stats[data["hostname"]] = data
            except Exception:
                logger.exception(traceback.format_exc())
                continue
Example #29
def get_state():
    return list(ray.actors().values())[0]["State"]
Example #30
def test_cond():
    alive_actors = [
        v for v in real_ray.actors().values()
        if v["State"] != ActorTableData.DEAD
    ]
    return len(alive_actors) == 0