Example #1
0
def test_limit_concurrency(shutdown_only):
    """Check that not all of many blocking 1-CPU tasks start on a 1-CPU node.

    Each task signals the driver that it started and then parks on a
    semaphore. The test asserts that at least one task has not started within
    the first wait, and that all of them start once most tasks are released.
    """
    ray.init(num_cpus=1)

    task_gate = Semaphore.remote(0)
    driver_gate = Semaphore.remote(0)

    # Round-trip to both semaphore actors before submitting any task.
    ray.get([task_gate.locked.remote(), driver_gate.locked.remote()])

    @ray.remote(num_cpus=1)
    def foo():
        # Tell the driver this task is running, then block until released.
        ray.get(driver_gate.release.remote())
        ray.get(task_gate.acquire.remote())

    num_tasks = 20
    refs = [foo.remote() for _ in range(num_tasks)]

    # One pending acquire per task; each resolves when some task has started.
    started_refs = [driver_gate.acquire.remote() for _ in range(num_tasks)]

    # The cap is relaxed over time, so some tasks get to run, but raising it
    # should take exponentially long, so not every task starts in time.
    _, still_waiting = ray.wait(
        started_refs, num_returns=num_tasks, timeout=10)
    assert len(still_waiting) >= 1

    # Let all but one of the tasks finish so the remaining ones can start.
    ray.get([task_gate.release.remote() for _ in range(num_tasks - 1)])

    _, still_waiting = ray.wait(
        started_refs, num_returns=num_tasks, timeout=10)
    assert len(still_waiting) == 0

    # Exactly one task was never released and must still be blocked.
    finished, unfinished = ray.wait(refs, timeout=15, num_returns=num_tasks)
    assert len(finished) == num_tasks - 1
    assert len(unfinished) == 1
Example #2
0
def test_for_each_concur_sync(ray_start_regular_shared):
    """Check that ``for_each(..., max_concurrency=2)`` bounds parallelism.

    Eight items are split over two shards and mapped with a task that
    signals ``main_wait`` when it starts and then blocks on ``test_wait``.
    After four start signals have been consumed, ``main_wait`` must be locked
    again; otherwise more than four tasks were running at once.
    """
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        # Announce the start, then park until the test releases us.
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.01)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)],
                    num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.01})

    # Drain the iterator in a remote task so this test is free to drive the
    # semaphores while the mapped tasks are blocked.
    list_promise = to_list.remote(it.gather_sync())

    # Wait for four tasks to report that they have started.
    for _ in range(4):
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point, so no further
    # start signal may be pending.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # Unblock all eight tasks.
    for _ in range(8):
        ray.get(test_wait.release.remote())

    assert repr(
        it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"
    result_list = ray.get(list_promise)
    assert set(result_list) == set(range(10, 18))
Example #3
0
def test_hybrid_policy(ray_start_cluster):
    """Check task placement on a two-node cluster for 5, 10 and 20 tasks.

    Each batch of tasks is held running simultaneously through two
    semaphores, and the node each task ran on is then counted.
    """
    cluster = ray_start_cluster
    num_nodes = 2
    num_cpus = 10
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpus, memory=num_cpus)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # `block_task` keeps scheduled tasks from returning until all of them are
    # running.
    block_task = Semaphore.remote(0)
    # `block_driver` keeps the driver from letting tasks continue until all
    # of them are running.
    block_driver = Semaphore.remote(0)

    # The memory resource is requested as well because the cpu is released
    # inside ray.get.
    @ray.remote(num_cpus=1, memory=1)
    def get_node():
        ray.get(block_driver.release.remote())
        ray.get(block_task.acquire.remote())
        return ray.worker.global_worker.current_node_id

    def run_batch(size):
        # Submit `size` tasks, wait until every one has started, release them
        # all, and return the node id reported by each task.
        batch = [get_node.remote() for _ in range(size)]
        ray.get([block_driver.acquire.remote() for _ in batch])
        ray.get([block_task.release.remote() for _ in batch])
        return ray.get(batch)

    # Below the hybrid threshold we pack on the local node first.
    assert len(set(run_batch(5))) == 1

    # We pack the second node to the hybrid threshold.
    per_node = collections.Counter(run_batch(10))
    for node_id, hits in per_node.items():
        print(f"{node_id}: {hits}")
        assert hits == 5

    # Once all nodes are past the hybrid threshold we round robin.
    # TODO (Alex): Ideally we could schedule less than 20 nodes here, but the
    # policy is imperfect if a resource report interrupts the process.
    per_node = collections.Counter(run_batch(20))
    for node_id, hits in per_node.items():
        print(f"{node_id}: {hits}")
        assert hits == 10, per_node
Example #4
0
def test_worker_failed(ray_start_workers_separate_multinode):
    """Kill worker processes mid-task and check results or known errors.

    The worker pids are collected first, then a two-wave workload is
    submitted and the workers are killed with SIGKILL while it runs. Every
    object must either resolve or raise RayTaskError / WorkerCrashedError.
    """
    num_nodes, num_initial_workers = ray_start_workers_separate_multinode
    worker_count = num_nodes * num_initial_workers

    worker_gate = Semaphore.remote(0)
    driver_gate = Semaphore.remote(0)
    ray.get([worker_gate.locked.remote(), driver_gate.locked.remote()])

    # A custom resource is not released on `ray.get`, which makes sure this
    # task is spread across all the nodes.
    @ray.remote(num_cpus=1, resources={"custom": 1})
    def get_pids():
        ray.get(driver_gate.release.remote())
        ray.get(worker_gate.acquire.remote())
        return os.getpid()

    pid_refs = [get_pids.remote() for _ in range(worker_count)]
    # Wait until every pid task is running, then let them all return.
    ray.get([driver_gate.acquire.remote() for _ in range(worker_count)])
    ray.get([worker_gate.release.remote() for _ in range(worker_count)])

    pids = set(ray.get(pid_refs))

    @ray.remote
    def f(x):
        time.sleep(0.5)
        return x

    # Submit more tasks than there are workers so that all workers and cores
    # are utilized: a first wave, plus a second wave depending on the first.
    first_wave = [f.remote(i) for i in range(num_initial_workers * num_nodes)]
    second_wave = [f.remote(upstream) for upstream in first_wave]
    object_refs = first_wave + second_wave

    # Give the tasks a moment to begin executing.
    time.sleep(0.1)

    # Kill the workers while the tasks execute.
    for pid in pids:
        try:
            os.kill(pid, SIGKILL)
        except OSError:
            # The process may already have exited due to worker capping.
            pass
        time.sleep(0.1)

    # Each object must either be retrievable or fail with an appropriate
    # exception.
    for object_ref in object_refs:
        try:
            ray.get(object_ref)
        except (ray.exceptions.RayTaskError,
                ray.exceptions.WorkerCrashedError):
            pass
Example #5
0
def test_warning_for_too_many_nested_tasks(shutdown_only):
    """Check that a workload needing too many workers emits a warning.

    Root tasks ``g`` each spawn a nested task ``h``, which in turn spawns a
    long-sleeping task ``f``. The test expects exactly one
    ``WORKER_POOL_LARGE_ERROR`` message on the error pubsub channel.
    """
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()
    # Close the subscriber even when an assertion below fails.
    try:
        remote_wait = Semaphore.remote(value=0)
        nested_wait = Semaphore.remote(value=0)

        # Round-trip to both semaphore actors before submitting any task.
        ray.get([
            remote_wait.locked.remote(),
            nested_wait.locked.remote(),
        ])

        @ray.remote(num_cpus=0.25)
        def f():
            time.sleep(1000)
            return 1

        @ray.remote(num_cpus=0.25)
        def h(nested_waits):
            # Signal that this h task started, then wait until every pending
            # acquire on `nested_wait` has been satisfied.
            nested_wait.release.remote()
            ray.get(nested_waits)
            ray.get(f.remote())

        @ray.remote(num_cpus=0.25)
        def g(remote_waits, nested_waits):
            # Signal that this g task started; the nested h task is only
            # submitted after every pending acquire on `remote_wait` has been
            # satisfied.
            remote_wait.release.remote()
            # wait until every lock is released.
            ray.get(remote_waits)
            ray.get(h.remote(nested_waits))

        num_root_tasks = num_cpus * 4
        # Lock remote task until everything is scheduled.
        remote_waits = []
        nested_waits = []
        for _ in range(num_root_tasks):
            remote_waits.append(remote_wait.acquire.remote())
            nested_waits.append(nested_wait.acquire.remote())

        # Fire-and-forget: the returned refs are intentionally not kept.
        for _ in range(num_root_tasks):
            g.remote(remote_waits, nested_waits)

        errors = get_error_message(p, 1,
                                   ray_constants.WORKER_POOL_LARGE_ERROR)
        assert len(errors) == 1
        assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    finally:
        p.close()
Example #6
0
def test_many_queued_tasks():
    """Queue MAX_QUEUED_TASKS no-op tasks behind blockers, then drain them.

    One blocking task is submitted per cluster CPU before the no-op tasks.
    The blockers are then released and every queued task must return None.
    """
    gate = Semaphore.remote(0)

    @ray.remote(num_cpus=1)
    def block():
        ray.get(gate.acquire.remote())

    @ray.remote(num_cpus=1)
    def f():
        pass

    num_cpus = int(ray.cluster_resources()["CPU"])
    # One blocking task per CPU; the refs are held for the rest of the test.
    blocked_tasks = [block.remote() for _ in range(num_cpus)]

    print("Submitting many tasks")
    pending_tasks = [f.remote() for _ in trange(MAX_QUEUED_TASKS)]

    # Release every blocker so that all the queued tasks can actually run.
    for _ in range(num_cpus):
        gate.release.remote()

    print("Unblocking tasks")
    for queued in tqdm(pending_tasks):
        assert ray.get(queued) is None
Example #7
0
def test_back_pressure(shutdown_only_with_initialization_check):
    """Check that ``max_pending_calls`` back-pressures actor method calls.

    With a limit of 10, the first ten blocked ``acquire`` calls must be
    accepted and the eleventh must raise ``PendingCallsLimitExceeded``. After
    the pending calls are released, a new call must be accepted again.
    """
    ray.init()

    signal_actor = Semaphore.options(max_pending_calls=10).remote(value=0)

    # Calls up to the limit must be accepted.
    try:
        for _ in range(10):
            signal_actor.acquire.remote()
    except ray.exceptions.PendingCallsLimitExceeded:
        pytest.fail(
            "PendingCallsLimitExceeded raised before reaching the "
            "max_pending_calls limit")

    # One call past the limit must be rejected.
    with pytest.raises(ray.exceptions.PendingCallsLimitExceeded):
        signal_actor.acquire.remote()

    @ray.remote
    def release(signal_actor):
        ray.get(signal_actor.release.remote())
        return 1

    # Release signal actor through common task,
    # because actor tasks will be back pressured
    for _ in range(10):
        ray.get(release.remote(signal_actor))

    # Check whether we can call remote actor normally after
    # back pressure released.
    try:
        signal_actor.acquire.remote()
    except ray.exceptions.PendingCallsLimitExceeded:
        pytest.fail(
            "PendingCallsLimitExceeded raised after the pending calls "
            "were released")

    ray.shutdown()
Example #8
0
def test_task_status(ray_start_regular):
    """Track task-status counts in the memory summary as tasks progress.

    Three semaphore-gated tasks (y and z depend on x) plus one task that
    requests a GPU are submitted. The number of summary lines carrying each
    status marker is checked after every semaphore release.
    """
    address = ray_start_regular["address"]

    @ray.remote
    def dep(sema, x=None):
        ray.get(sema.acquire.remote())
        return

    @ray.remote(num_gpus=1)
    def impossible():
        pass

    def filtered_summary():
        # Memory summary text without the actor handle refs.
        kept = []
        for line in memory_summary(address, line_wrap=False).split("\n"):
            if "ACTOR_HANDLE" not in line:
                kept.append(line)
        data = "\n".join(kept)
        print(data)
        return data

    def expect(status, occurrences):
        # Poll until the summary shows `status` exactly `occurrences` times.
        wait_for_condition(
            lambda: count(filtered_summary(), status) == occurrences)

    sema = Semaphore.remote(value=0)
    x = dep.remote(sema)
    y = dep.remote(sema, x=x)
    im = impossible.remote()  # noqa
    # x and its semaphore task are scheduled. im cannot
    # be scheduled, so it is pending forever.
    expect(WAITING_FOR_EXECUTION, 2)
    expect(WAITING_FOR_DEPENDENCIES, 1)
    expect(SCHEDULED, 1)

    z = dep.remote(sema, x=x)
    expect(WAITING_FOR_DEPENDENCIES, 2)
    expect(WAITING_FOR_EXECUTION, 2)
    expect(FINISHED, 0)

    # First release: x can finish, which unblocks y and z.
    sema.release.remote()
    time.sleep(2)
    expect(FINISHED, 1)
    expect(WAITING_FOR_DEPENDENCIES, 0)
    # y, z, and two semaphore tasks are scheduled.
    expect(WAITING_FOR_EXECUTION, 4)

    # Second release: one more task finishes.
    sema.release.remote()
    expect(FINISHED, 2)
    expect(WAITING_FOR_DEPENDENCIES, 0)
    expect(WAITING_FOR_EXECUTION, 2)

    # Third release: the last gated task finishes.
    sema.release.remote()
    ray.get(y)
    ray.get(z)
    expect(FINISHED, 3)
    expect(WAITING_FOR_DEPENDENCIES, 0)
    expect(WAITING_FOR_EXECUTION, 0)
    expect(SCHEDULED, 1)
def test_zero_cpu_scheduling(shutdown_only):
    """Check that two ``num_cpus=0`` tasks both run on a 1-CPU node."""
    ray.init(num_cpus=1)

    task_gate = Semaphore.remote(0)
    driver_gate = Semaphore.remote(0)

    @ray.remote(num_cpus=0)
    def foo():
        # Signal the driver, then block; `task_gate` is never released here.
        ray.get(driver_gate.release.remote())
        ray.get(task_gate.acquire.remote())

    for _ in range(2):
        foo.remote()

    # Wait for the first start signal.
    ray.get(driver_gate.acquire.remote())

    second_signal = driver_gate.acquire.remote()

    # Both tasks should be running, so the second start signal must arrive
    # and unblock the driver within the timeout.
    _, pending = ray.wait([second_signal], timeout=1)
    assert not pending
Example #10
0
def test_task_status(ray_start_regular):
    """Track task-status counts in the memory summary as tasks progress.

    Three semaphore-gated tasks are submitted, with y and z depending on x.
    The number of summary lines carrying each status marker is checked after
    every semaphore release.
    """
    address = ray_start_regular["address"]

    @ray.remote
    def dep(sema, x=None):
        ray.get(sema.acquire.remote())
        return

    def filtered_summary():
        # Memory summary text without the actor handle refs.
        lines = memory_summary(address, line_wrap=False).split("\n")
        return "\n".join(
            line for line in lines if "ACTOR_HANDLE" not in line)

    def await_count(status, expected):
        # Poll until the summary shows `status` exactly `expected` times.
        wait_for_condition(
            lambda: count(filtered_summary(), status) == expected)

    sema = Semaphore.remote(value=0)
    x = dep.remote(sema)
    y = dep.remote(sema, x=x)
    # x and its semaphore task are scheduled.
    await_count(SCHEDULED, 2)
    await_count(WAITING_FOR_DEPENDENCIES, 1)

    z = dep.remote(sema, x=x)
    await_count(WAITING_FOR_DEPENDENCIES, 2)
    await_count(SCHEDULED, 2)
    await_count(FINISHED, 0)

    # First release: x can finish, which unblocks y and z.
    sema.release.remote()
    time.sleep(2)
    await_count(FINISHED, 1)
    await_count(WAITING_FOR_DEPENDENCIES, 0)
    # y, z, and two semaphore tasks are scheduled.
    await_count(SCHEDULED, 4)

    # Second release: one more task finishes.
    sema.release.remote()
    await_count(FINISHED, 2)
    await_count(WAITING_FOR_DEPENDENCIES, 0)
    await_count(SCHEDULED, 2)

    # Third release: the last gated task finishes.
    sema.release.remote()
    ray.get(y)
    ray.get(z)
    await_count(FINISHED, 3)
    await_count(WAITING_FOR_DEPENDENCIES, 0)
    await_count(SCHEDULED, 0)
Example #11
0
def test_multi_driver_logging(ray_start_regular):
    """Check that each driver's stdout shows only its own remote prints.

    Two driver subprocesses are started from the same script template. Each
    one prints two strings through a remote task, stepping in lockstep with
    this test via named Semaphore actors. The test then asserts on the last
    character of the first two stdout lines of each driver.
    """
    address_info = ray_start_regular
    address = address_info["address"]

    # ray.init(address=address)
    # One semaphore per driver gates that driver's next step; `main_wait` is
    # released by the drivers to report progress back to this test.
    driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0)
    driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0)
    main_wait = Semaphore.options(name="main_wait").remote(value=0)

    # The creation of an actor is asynchronous.
    # We need to wait for the completion of the actor creation,
    # otherwise we can't get the actor by name.
    ray.get(driver1_wait.locked.remote())
    ray.get(driver2_wait.locked.remote())
    ray.get(main_wait.locked.remote())

    # Params are address, semaphore name, output1, output2
    driver_script_template = """
import ray
import sys
from ray._private.test_utils import Semaphore

@ray.remote(num_cpus=0)
def remote_print(s, file=None):
    print(s, file=file)

ray.init(address="{}", namespace="default_test_namespace")

driver_wait = ray.get_actor("{}")
main_wait = ray.get_actor("main_wait")

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s1 = "{}"
ray.get(remote_print.remote(s1))

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s2 = "{}"
ray.get(remote_print.remote(s2))

ray.get(main_wait.release.remote())
    """

    # Launch both drivers without waiting for them to finish.
    p1 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver1_wait", "1", "2"))
    p2 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver2_wait", "3", "4"))

    # Each driver releases `main_wait` once after connecting.
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())
    # At this point both of the other drivers are fully initialized.

    # Let each driver print its first string.
    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '1' and driver2 '3'
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    # Let each driver print its second string.
    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '2' and driver2 '4'
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    driver1_out = p1.stdout.read().decode("ascii")
    driver2_out = p2.stdout.read().decode("ascii")
    if sys.platform == "win32":
        # Drop carriage returns so that splitting on "\n" gives clean lines.
        driver1_out = driver1_out.replace("\r", "")
        driver2_out = driver2_out.replace("\r", "")
    driver1_out_split = driver1_out.split("\n")
    driver2_out_split = driver2_out.split("\n")

    # Only the last character of each line is compared.
    # NOTE(review): presumably the lines carry a log prefix - confirm.
    assert driver1_out_split[0][-1] == "1", driver1_out_split
    assert driver1_out_split[1][-1] == "2", driver1_out_split
    assert driver2_out_split[0][-1] == "3", driver2_out_split
    assert driver2_out_split[1][-1] == "4", driver2_out_split
Example #12
0
def test_memory_util(ray_start_cluster):
    """Check the task status shown for "Attempt #2" lines in memory_summary.

    A chain of semaphore-gated tasks runs on a worker node, which is then
    removed and replaced. A new task depending on an intermediate result is
    submitted, and the number of "Attempt #2" summary lines in each status is
    checked as the semaphore is released step by step.
    """
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        resources={"head": 1},
        _system_config=config,
        enable_object_reconstruction=True,
    )
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def large_object(sema=None):
        # Optionally block on the semaphore before producing a 10**7-byte
        # array.
        if sema is not None:
            ray.get(sema.acquire.remote())
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x, sema):
        # Block on the semaphore, then pass the input through unchanged.
        ray.get(sema.acquire.remote())
        return x

    def stats():
        # Return (waiting, scheduled, finished): the number of summary lines
        # containing "Attempt #2" together with each status marker.
        info = memory_summary(cluster.address, line_wrap=False)
        print(info)
        info = info.split("\n")
        reconstructing_waiting = [
            line for line in info
            if "Attempt #2" in line and WAITING_FOR_DEPENDENCIES in line
        ]
        reconstructing_scheduled = [
            line for line in info
            if "Attempt #2" in line and WAITING_FOR_EXECUTION in line
        ]
        reconstructing_finished = [
            line for line in info if "Attempt #2" in line and FINISHED in line
        ]
        return (
            len(reconstructing_waiting),
            len(reconstructing_scheduled),
            len(reconstructing_finished),
        )

    # The semaphore actor requests the "head" resource, which only the head
    # node provides.
    sema = Semaphore.options(resources={"head": 1}).remote(value=0)
    # Chain obj -> x -> ref, each requesting the "node1" resource and each
    # gated on one semaphore release.
    obj = large_object.options(resources={"node1": 1}).remote(sema)
    x = dependent_task.options(resources={"node1": 1}).remote(obj, sema)
    ref = dependent_task.options(resources={"node1": 1}).remote(x, sema)
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    # No summary line mentions a second attempt yet.
    wait_for_condition(lambda: stats() == (0, 0, 0))
    del ref

    # Remove the worker node non-gracefully and add a replacement with the
    # same resources.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)

    # NOTE(review): presumably x and obj were lost with the removed node, so
    # using x here triggers their reconstruction - confirm.
    ref = dependent_task.remote(x, sema)
    # One second attempt is waiting on its dependency, one awaits execution.
    wait_for_condition(lambda: stats() == (1, 1, 0))
    ray.get(sema.release.remote())
    # After one release: one second attempt finished, one awaits execution.
    wait_for_condition(lambda: stats() == (0, 1, 1))
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    # Both second attempts have finished.
    wait_for_condition(lambda: stats() == (0, 0, 2))