Example #1
def test_max_running_tasks():
    counter = Semaphore.remote(0)
    blocker = Semaphore.remote(0)

    @ray.remote(num_cpus=0.25)
    def task(counter, blocker):
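        # Hold 0.25 CPU for the duration of the test; the counter and blocker
        # handles are not used inside the task body.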
        sleep(300)

    refs = [
        task.remote(counter, blocker)
        for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks")
    ]

    max_cpus = ray.cluster_resources()["CPU"]
    min_cpus_available = max_cpus
    for _ in trange(int(300 / 0.1), desc="Waiting"):
        try:
            cur_cpus = ray.available_resources().get("CPU", 0)
            min_cpus_available = min(min_cpus_available, cur_cpus)
        except Exception:
            # There is a race condition: `.get` can fail if a new heartbeat
            # arrives at the same time.
            pass
        sleep(0.1)

    # There are some relevant magic numbers in this check: 10k tasks each
    # require 0.25 CPUs, so ideally 2.5k CPUs will be in use.
    err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used."
    assert max_cpus - min_cpus_available > 2000, err_str

    for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER,
                    desc="Ensuring all tasks have finished"):
        done, refs = ray.wait(refs)
        assert ray.get(done[0]) is None
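
All of these examples assume a Semaphore actor defined in the surrounding test module (Ray ships one in its test utilities); the trange/tqdm progress helpers and the MAX_* constants likewise come from that module. A minimal sketch of such an actor, assuming an async actor that wraps asyncio.Semaphore with the acquire/release/locked methods used above:

import asyncio

import ray


@ray.remote(num_cpus=0)
class Semaphore:
    def __init__(self, value=1):
        # Async actor wrapping a plain asyncio.Semaphore.
        self._sema = asyncio.Semaphore(value=value)

    async def acquire(self):
        await self._sema.acquire()

    async def release(self):
        self._sema.release()

    async def locked(self):
        # True if the semaphore cannot currently be acquired without blocking.
        return self._sema.locked()
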
Example #2
def test_for_each_concur_sync(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.01)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)],
                    num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.01})

    list_promise = to_list.remote(it.gather_sync())

    for i in range(4):
        assert i in [0, 1, 2, 3]
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    for i in range(8):
        ray.get(test_wait.release.remote())

    assert repr(it) == (
        "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]")
    result_list = ray.get(list_promise)
    assert set(result_list) == set(range(10, 18))
Example #3
def test_hybrid_policy(ray_start_cluster):

    cluster = ray_start_cluster
    num_nodes = 2
    num_cpus = 10
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpus, memory=num_cpus)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # `block_task` ensures that scheduled tasks do not return until all are
    # running.
    block_task = Semaphore.remote(0)
    # `block_driver` ensures that the driver does not allow tasks to continue
    # until all are running.
    block_driver = Semaphore.remote(0)

    # Request memory in addition to CPU because the CPU is released while the
    # task blocks in the ray.get calls below.
    @ray.remote(num_cpus=1, memory=1)
    def get_node():
        ray.get(block_driver.release.remote())
        ray.get(block_task.acquire.remote())
        return ray.worker.global_worker.current_node_id

    # Below the hybrid threshold we pack on the local node first.
    refs = [get_node.remote() for _ in range(5)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    assert len(set(nodes)) == 1

    # We pack the second node to the hybrid threshold.
    refs = [get_node.remote() for _ in range(10)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    counter = collections.Counter(nodes)
    for node_id in counter:
        print(f"{node_id}: {counter[node_id]}")
        assert counter[node_id] == 5

    # Once all nodes are past the hybrid threshold we round robin.
    # TODO (Alex): Ideally we could schedule fewer than 20 tasks here, but the
    # policy is imperfect if a resource report interrupts the process.
    refs = [get_node.remote() for _ in range(20)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    counter = collections.Counter(nodes)
    for node_id in counter:
        print(f"{node_id}: {counter[node_id]}")
        assert counter[node_id] == 10, counter
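
Note that ray.worker.global_worker.current_node_id used above is an internal attribute; recent Ray releases expose the same information through the runtime context. A minimal sketch, assuming a current Ray version with the public runtime-context API:

import ray


@ray.remote
def which_node():
    # Public replacement for the internal current_node_id lookup.
    return ray.get_runtime_context().get_node_id()


ray.init()
print(ray.get(which_node.remote()))
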
Example #4
def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that running a workload which requires too many workers to be
    # started produces a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)

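    # Block until both semaphore actors are up before any tasks use them.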
    ray.get([
        remote_wait.locked.remote(),
        nested_wait.locked.remote(),
    ])

    @ray.remote
    def f():
        time.sleep(1000)
        return 1

    @ray.remote
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote
    def g(remote_waits, nested_waits):
        # Signal the driver that this g task is running, so that the h and f
        # tasks are only submitted after all of the g tasks.
        remote_wait.release.remote()
        # Block until every g task has released the semaphore, i.e. until all
        # root tasks are running.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock remote task until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()
Example #5
def test_many_queued_tasks():
    sema = Semaphore.remote(0)

    @ray.remote(num_cpus=1)
    def block():
        ray.get(sema.acquire.remote())

    @ray.remote(num_cpus=1)
    def f():
        pass

    num_cpus = int(ray.cluster_resources()["CPU"])
    blocked_tasks = []
    for _ in range(num_cpus):
        blocked_tasks.append(block.remote())

    print("Submitting many tasks")
    pending_tasks = []
    for _ in trange(MAX_QUEUED_TASKS):
        pending_tasks.append(f.remote())

    # Make sure all the tasks can actually run.
    for _ in range(num_cpus):
        sema.release.remote()

    print("Unblocking tasks")
    for ref in tqdm(pending_tasks):
        assert ray.get(ref) is None
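
Pulling results one reference at a time works, but with this many queued tasks a batched drain is another option. A sketch of a hypothetical drain helper built on ray.wait:

import ray


def drain(refs, batch_size=1000):
    # Hypothetical helper: resolve results in batches with ray.wait instead
    # of calling ray.get once per reference.
    remaining = list(refs)
    while remaining:
        done, remaining = ray.wait(
            remaining, num_returns=min(batch_size, len(remaining)))
        ray.get(done)
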
Example #6
def test_for_each_concur(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.1)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)],
                    num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.1})

    for i in range(4):
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # When we finish one task, exactly one more should start.
    ray.get(test_wait.release.remote())
    ray.get(main_wait.acquire.remote())
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # Finish everything and make sure the output matches a regular iterator.
    for i in range(3):
        ray.get(test_wait.release.remote())

    assert repr(it) == (
        "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]")
    assert ray.get(to_list.remote(it.gather_sync())) == list(range(10, 18))
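
Examples #2 and #6 exercise ray.util.iter, the (since-deprecated) ParallelIterator API. A minimal usage sketch, assuming a Ray release that still ships ray.util.iter:

import ray
from ray.util.iter import from_items

ray.init()
it = from_items([1, 2, 3, 4], num_shards=2)
doubled = it.for_each(lambda x: x * 2)
# gather_sync() pulls elements from every shard into a local iterator.
print(sorted(doubled.gather_sync()))  # [2, 4, 6, 8]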