Beispiel #1
0
def test_connect_with_disconnected_node(shutdown_only):
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
    assert len(errors) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    # There is no connection error to a dead node.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    p.close()
Beispiel #2
0
def test_warning_for_too_many_actors(shutdown_only):
    # Check that if we run a workload which requires too many workers to be
    # started that we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)

    p = init_error_pubsub()

    @ray.remote
    class Foo:
        def __init__(self):
            time.sleep(1000)

    # NOTE: We should save actor, otherwise it will be out of scope.
    actor_group1 = [Foo.remote() for _ in range(num_cpus * 3)]
    assert len(actor_group1) == num_cpus * 3
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR

    actor_group2 = [Foo.remote() for _ in range(num_cpus)]
    assert len(actor_group2) == num_cpus
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Beispiel #3
0
def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that if we run a workload which requires too many workers to be
    # started that we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    @ray.remote
    def f():
        time.sleep(1000)
        return 1

    @ray.remote
    def h():
        time.sleep(1)
        ray.get(f.remote())

    @ray.remote
    def g():
        # Sleep so that the f tasks all get submitted to the scheduler after
        # the g tasks.
        time.sleep(1)
        ray.get(h.remote())

    [g.remote() for _ in range(num_cpus * 4)]
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Beispiel #4
0
def test_detached_warning(shutdown_only):
    ray.init()

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    error_pubsub = init_error_pubsub()
    actor = DetachedActor.options(  # noqa: F841
        name="Pinger", lifetime="detached").remote()
    errors = get_error_message(error_pubsub, 1, None)
    error = errors.pop()
    assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR
Beispiel #5
0
def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that if we run a workload which requires too many workers to be
    # started that we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)

    ray.get([
        remote_wait.locked.remote(),
        nested_wait.locked.remote(),
    ])

    @ray.remote
    def f():
        time.sleep(1000)
        return 1

    @ray.remote
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote
    def g(remote_waits, nested_waits):
        # Sleep so that the f tasks all get submitted to the scheduler after
        # the g tasks.
        remote_wait.release.remote()
        # wait until every lock is released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock remote task until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Beispiel #6
0
def test_warning_actor_waiting_on_actor(shutdown_only):
    ray.init(num_cpus=1,
             _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        pass

    a = Actor.remote()  # noqa
    b = Actor.remote()  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
Beispiel #7
0
def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # Check that we cannot place an actor on a 0 CPU machine and that we get an
    # infeasibility warning (even though the actor creation task itself
    # requires no CPUs).

    ray.init(num_cpus=0)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        pass

    # The actor creation should be infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
    p.close()
Beispiel #8
0
def test_warning_all_tasks_blocked(shutdown_only):
    ray.init(num_cpus=1,
             _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            return 0

    @ray.remote
    def f():
        # Creating both actors is not possible.
        actors = [Foo.remote() for _ in range(3)]
        for a in actors:
            ray.get(a.f.remote())

    # Run in a task to check we handle the blocked task case correctly
    f.remote()
    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
Beispiel #9
0
def test_warning_many_actor_tasks_queued(shutdown_only):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            import time
            time.sleep(1)

    a = Foo.remote()
    [a.f.remote() for _ in range(50000)]
    errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING)
    msgs = [e.error_message for e in errors]
    assert ("Warning: More than 5000 tasks are pending submission to actor"
            in msgs[0])
    assert ("Warning: More than 10000 tasks are pending submission to actor"
            in msgs[1])
    assert ("Warning: More than 20000 tasks are pending submission to actor"
            in msgs[2])
    assert ("Warning: More than 40000 tasks are pending submission to actor"
            in msgs[3])
Beispiel #10
0
def error_pubsub():
    p = init_error_pubsub()
    yield p
    p.close()
def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)
    p = init_error_pubsub()

    # There shouldn't be any errors yet.
    errors = get_error_message(p, 1, 2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(p, 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray.test_utils import (init_error_pubsub, get_error_message)

ray.init(address="{}")
p = init_error_pubsub()
time.sleep(1)
errors = get_error_message(p, 1, 2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(p, 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(p, 1)
    assert len(errors) == 1
    p.close()