Example #1
def test_apply_async(pool):
    def f(arg1, arg2, kwarg1=None, kwarg2=None):
        assert arg1 == 1
        assert arg2 == 2
        assert kwarg1 is None
        assert kwarg2 == 3
        return 1

    assert pool.apply_async(f, (1, 2), {"kwarg2": 3}).get() == 1
    with pytest.raises(AssertionError):
        pool.apply_async(f, (2, 2), {"kwarg2": 3}).get()
    with pytest.raises(Exception):
        pool.apply_async(f, (1, )).get()
    with pytest.raises(Exception):
        pool.apply_async(f, (1, 2), {"kwarg1": 3}).get()

    # Won't return until the signal is sent.
    def ten_over(args):
        signal, val = args
        ray.get(signal.wait.remote())
        return 10 / val

    signal = SignalActor.remote()
    result = pool.apply_async(ten_over, ([signal, 10], ))
    result.wait(timeout=0.01)
    assert not result.ready()
    with pytest.raises(TimeoutError):
        result.get(timeout=0.01)

    # Send the signal to unblock the task.
    ray.get(signal.send.remote())
    result.wait(timeout=10)
    assert result.ready()
    assert result.successful()
    assert result.get() == 1

    signal = SignalActor.remote()
    result = pool.apply_async(ten_over, ([signal, 0], ))
    with pytest.raises(ValueError, match="not ready"):
        result.successful()

    # Send the signal; the task then fails (val == 0 causes a divide by zero).
    ray.get(signal.send.remote())
    result.wait(timeout=10)
    assert result.ready()
    assert not result.successful()
    with pytest.raises(ZeroDivisionError):
        result.get()
Example #2
def test_imap_timeout(pool_4_processes, use_iter):
    def f(args):
        index, wait_index, signal = args
        time.sleep(0.1 * random.random())
        if index == wait_index:
            ray.get(signal.wait.remote())
        return index

    wait_index = 23
    signal = SignalActor.remote()
    if use_iter:
        imap_iterable = iter([(index, wait_index, signal)
                              for index in range(100)])
    else:
        imap_iterable = [(index, wait_index, signal) for index in range(100)]
    result_iter = pool_4_processes.imap(f, imap_iterable)
    for i in range(100):
        if i == wait_index:
            with pytest.raises(TimeoutError):
                result = result_iter.next(timeout=0.1)
            ray.get(signal.send.remote())

        result = result_iter.next()
        assert result == i

    with pytest.raises(StopIteration):
        result_iter.next()

    wait_index = 23
    signal = SignalActor.remote()
    if use_iter:
        imap_iterable = iter([(index, wait_index, signal)
                              for index in range(100)])
    else:
        imap_iterable = [(index, wait_index, signal) for index in range(100)]
    result_iter = pool_4_processes.imap_unordered(f,
                                                  imap_iterable,
                                                  chunksize=11)
    in_order = []
    for i in range(100):
        try:
            result = result_iter.next(timeout=1)
        except TimeoutError:
            ray.get(signal.send.remote())
            result = result_iter.next()

        in_order.append(result == i)

    # Check that the results didn't come back all in order.
    # NOTE: this could be flaky if the calls happened to finish in order due
    # to the random sleeps, but it's very unlikely.
    assert not all(in_order)

    with pytest.raises(StopIteration):
        result_iter.next()
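The `pool` and `pool_4_processes` arguments are pytest fixtures that construct a `ray.util.multiprocessing.Pool`, Ray's drop-in replacement for the standard library's `multiprocessing.Pool`. A plausible sketch of such a fixture (the exact setup in Ray's test suite may differ):

import pytest
import ray
from ray.util.multiprocessing import Pool


@pytest.fixture
def pool_4_processes():
    # Each pool "process" is backed by a Ray actor rather than an OS process.
    pool = Pool(processes=4)
    yield pool
    pool.terminate()
    pool.join()
    ray.shutdown()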
Example #3
        def create_actor(self, caller_handle):
            s = SignalActor.remote()
            # Create an actor which depends on an object that can never be
            # resolved.
            actor_handle = Actor.remote(s.wait.remote())

            pid = os.getpid()
            signal_handle = SignalActor.remote()
            caller_handle.call.remote(pid, signal_handle, actor_handle)
            # Wait until the `Caller` starts executing the remote `call` method.
            ray.get(signal_handle.wait.remote())
Example #4
def test_heartbeats_single(ray_start_cluster_head):
    """Unit test for `Cluster.wait_for_nodes`.

    Test proper metrics.
    """
    cluster = ray_start_cluster_head
    if use_gcs_for_bootstrap():
        monitor = setup_monitor(cluster.gcs_address)
    else:
        monitor = setup_monitor(cluster.address)
    total_cpus = ray.state.cluster_resources()["CPU"]
    verify_load_metrics(monitor, ({"CPU": 0.0}, {"CPU": total_cpus}))

    @ray.remote
    def work(signal):
        wait_signal = signal.wait.remote()
        while True:
            ready, not_ready = ray.wait([wait_signal], timeout=0)
            if len(ready) == 1:
                break
            time.sleep(1)

    signal = SignalActor.remote()

    work_handle = work.remote(signal)
    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)

    @ray.remote(num_cpus=1)
    class Actor:
        def work(self, signal):
            wait_signal = signal.wait.remote()
            while True:
                ready, not_ready = ray.wait([wait_signal], timeout=0)
                if len(ready) == 1:
                    break
                time.sleep(1)

    signal = SignalActor.remote()

    test_actor = Actor.remote()
    work_handle = test_actor.work.remote(signal)
    time.sleep(1)  # Time for actor to get placed and the method to start.

    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)
    del monitor
Example #5
def test_basic_serialized_reference(one_worker_100MiB, use_ray_put, failure):
    @ray.remote(max_retries=1)
    def pending(ref, dep):
        ray.get(ref[0])
        if failure:
            os._exit(0)

    array_oid = put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    signal = SignalActor.remote()
    obj_ref = pending.remote([array_oid], signal.wait.remote())

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the task to finish.
    ray.get(signal.send.remote())
    try:
        ray.get(obj_ref)
        assert not failure
    except ray.exceptions.WorkerCrashedError:
        assert failure

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
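`put_object` and `_fill_object_store_and_get` are small helpers from Ray's reference-counting tests. The first stores an object either via `ray.put` or as a task's return value, depending on the `use_ray_put` parametrization; the second fills the object store to force eviction and then checks whether the object is still retrievable. A rough sketch of the intended semantics (the real helpers poll the core worker's object store directly and differ in details):

import numpy as np
import pytest
import ray


@ray.remote
def _return_value(obj):
    return obj


def put_object(obj, use_ray_put):
    # Either own the object directly via ray.put, or let a task own it.
    if use_ray_put:
        return ray.put(obj)
    return _return_value.remote(obj)


def _fill_object_store_and_get(obj, succeed=True, object_MiB=20,
                               num_objects=5):
    # Fill the small object store with throwaway arrays to force eviction.
    for _ in range(num_objects):
        ray.put(np.zeros(object_MiB * 1024 * 1024, dtype=np.uint8))
    if isinstance(obj, bytes):
        obj = ray.ObjectRef(obj)
    if succeed:
        # A live reference should pin the object, so this must not raise.
        ray.get(obj, timeout=10)
    else:
        # With no references left, the object should have been evicted.
        with pytest.raises(Exception):
            ray.get(obj, timeout=1)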
Example #6
def test_reconfigure_with_queries(serve_instance):
    signal = SignalActor.remote()

    @serve.deployment(max_concurrent_queries=10, num_replicas=3)
    class A:
        def __init__(self):
            self.state = None

        def reconfigure(self, config):
            self.state = config

        async def __call__(self):
            await signal.wait.remote()
            return self.state["a"]

    A.options(version="1", user_config={"a": 1}).deploy()
    handle = A.get_handle()
    refs = []
    for _ in range(30):
        refs.append(handle.remote())

    @ray.remote(num_cpus=0)
    def reconfigure():
        A.options(version="1", user_config={"a": 2}).deploy()

    reconfigure_ref = reconfigure.remote()
    signal.send.remote()
    ray.get(reconfigure_ref)
    for ref in refs:
        assert ray.get(ref) == 1
    assert ray.get(handle.remote()) == 2
Example #7
def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put,
                                             failure):
    @ray.remote(max_retries=1)
    def child(dep1, dep2):
        if failure:
            os._exit(0)
        return

    @ray.remote
    def launch_pending_task(ref, signal):
        return child.remote(ref[0], signal.wait.remote())

    signal = SignalActor.remote()

    # Test that the reference held by the actor isn't evicted.
    array_oid = put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8),
                           use_ray_put)
    child_return_id = ray.get(launch_pending_task.remote([array_oid], signal))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the reference prevents the object from being evicted.
    _fill_object_store_and_get(array_oid_bytes)

    ray.get(signal.send.remote())
    try:
        ray.get(child_return_id)
        assert not failure
    except (ray.exceptions.WorkerCrashedError, ray.exceptions.ObjectLostError):
        assert failure
    del child_return_id

    _fill_object_store_and_get(array_oid_bytes, succeed=False)
Example #8
def test_healthcheck_timeout(serve_instance):
    # https://github.com/ray-project/ray/issues/24554

    signal = SignalActor.remote()

    @serve.deployment(
        _health_check_timeout_s=2,
        _health_check_period_s=1,
        _graceful_shutdown_timeout_s=0,
    )
    class A:
        def check_health(self):
            return True

        def __call__(self):
            ray.get(signal.wait.remote())

    A.deploy()
    handle = A.get_handle()
    ref = handle.remote()
    # Without the proper fix, the ref would fail with an actor-died error.
    with pytest.raises(GetTimeoutError):
        ray.get(ref, timeout=10)
    signal.send.remote()
    ray.get(ref)
Example #9
def test_actor_task_fast_fail(ray_start_cluster):
    # Explicitly set `max_task_retries=0` here to show the test scenario.
    @ray.remote(max_restarts=1, max_task_retries=0)
    class SlowActor:
        def __init__(self, signal_actor):
            if ray.get_runtime_context().was_current_actor_reconstructed:
                ray.get(signal_actor.wait.remote())

        def ping(self):
            return "pong"

    signal = SignalActor.remote()
    actor = SlowActor.remote(signal)
    ray.get(actor.ping.remote())
    ray.kill(actor, no_restart=False)

    # Wait for a while so that now the driver knows the actor is in
    # RESTARTING state.
    time.sleep(1)
    # An actor task should fail quickly until the actor is restarted if
    # `max_task_retries` is 0.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.ping.remote())

    signal.send.remote()
    # Wait for a while so that now the driver knows the actor is in
    # ALIVE state.
    time.sleep(1)
    # An actor task should succeed.
    ray.get(actor.ping.remote())
Example #10
def test_single_cpu_cancel(shutdown_only, use_force):
    ray.init(num_cpus=1)
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    obj1 = wait_for.remote([signaler.wait.remote()])
    obj2 = wait_for.remote([obj1])
    obj3 = wait_for.remote([obj2])
    indep = wait_for.remote([signaler.wait.remote()])

    assert len(ray.wait([obj3], timeout=0.1)[0]) == 0
    ray.cancel(obj3, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(obj3)

    ray.cancel(obj1, force=use_force)

    for d in [obj1, obj2]:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(d)

    signaler.send.remote()
    ray.get(indep)
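`valid_exceptions(use_force)` is a helper shared by the cancellation examples: it returns the exception types that `ray.get` may legitimately raise for a cancelled task, which depend on whether the cancel was forced. A sketch consistent with the assertions above (the exact tuple in Ray's test_cancel.py may differ):

import ray


def valid_exceptions(use_force):
    # A forced cancel kills the worker process outright, so the error can
    # surface as a WorkerCrashedError; a graceful cancel raises
    # TaskCancelledError. RayTaskError covers dependents of a cancelled task.
    if use_force:
        return (ray.exceptions.RayTaskError,
                ray.exceptions.WorkerCrashedError)
    return (ray.exceptions.RayTaskError, ray.exceptions.TaskCancelledError)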
Example #11
def test_step_resources(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    # We use a signal actor here because we can't guarantee the order of
    # tasks sent from the worker to the raylet.
    signal_actor = SignalActor.remote()

    @workflow.step
    def step_run():
        ray.wait([signal_actor.send.remote()])
        with FileLock(lock_path):
            return None

    @ray.remote(num_cpus=1)
    def remote_run():
        return None

    lock = FileLock(lock_path)
    lock.acquire()
    ret = step_run.options(num_cpus=2).step().run_async()
    ray.wait([signal_actor.wait.remote()])
    obj = remote_run.remote()
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(obj, timeout=2)
    lock.release()
    assert ray.get(ret) is None
    assert ray.get(obj) is None
Example #12
def test_fastapiwrapper_constructor_before_startup_hooks(serve_instance):
    """
    Tests that the class constructor is called before the startup hooks
    are run in FastAPIWrapper. SignalActor event is set from a startup hook
    and is awaited in the class constructor. If the class constructor is run
    before the startup hooks, the SignalActor event will time out while waiting
    and the test will pass.
    """
    app = FastAPI()
    signal = SignalActor.remote()

    @app.on_event("startup")
    def startup_event():
        ray.get(signal.send.remote())

    @serve.deployment(route_prefix="/")
    @serve.ingress(app)
    class TestDeployment:
        def __init__(self):
            self.test_passed = False
            try:
                ray.get(signal.wait.remote(), timeout=.1)
                self.test_passed = False
            except GetTimeoutError:
                self.test_passed = True

        @app.get("/")
        def root(self):
            return self.test_passed

    TestDeployment.deploy()
    resp = requests.get("http://localhost:8000/")
    assert resp.json()
Example #13
async def test_job_runs_with_no_resources_available(job_manager):
    script_path = _driver_script_path("consume_one_cpu.py")

    hang_signal_actor = SignalActor.remote()

    @ray.remote(num_cpus=ray.available_resources()["CPU"])
    def consume_all_cpus():
        ray.get(hang_signal_actor.wait.remote())

    # Start a hanging task that consumes all CPUs.
    hanging_ref = consume_all_cpus.remote()

    try:
        # Check that the job starts up properly even with no CPUs available.
        # The job won't exit until it has a CPU available because it waits for
        # a task.
        job_id = job_manager.submit_job(entrypoint=f"python {script_path}")
        await async_wait_for_condition(check_job_running,
                                       job_manager=job_manager,
                                       job_id=job_id)
        await async_wait_for_condition(
            lambda: "Hanging..." in job_manager.get_job_logs(job_id))

        # Signal the hanging task to exit and release its CPUs.
        ray.get(hang_signal_actor.send.remote())

        # Check the job succeeds now that resources are available.
        await async_wait_for_condition(check_job_succeeded,
                                       job_manager=job_manager,
                                       job_id=job_id)
        await async_wait_for_condition(
            lambda: "Success!" in job_manager.get_job_logs(job_id))
    finally:
        # Just in case the test fails.
        ray.cancel(hanging_ref)
Example #14
def test_comprehensive(ray_start_regular, use_force):
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        ray.get(t[0])
        return "Result"

    @ray.remote
    def combine(a, b):
        return str(a) + str(b)

    a = wait_for.remote([signaler.wait.remote()])
    b = wait_for.remote([signaler.wait.remote()])
    combo = combine.remote(a, b)
    a2 = wait_for.remote([a])

    assert len(ray.wait([a, b, a2, combo], timeout=1)[0]) == 0

    ray.cancel(a, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(a, timeout=10)

    with pytest.raises(valid_exceptions(use_force)):
        ray.get(a2, timeout=40)

    signaler.send.remote()

    with pytest.raises(valid_exceptions(use_force)):
        ray.get(combo)
Example #15
def test_deleted_actor_no_restart(ray_start_regular):
    @ray.remote(resources={"actor": 1}, max_restarts=3)
    class Actor:
        def method(self):
            return 1

        def getpid(self):
            return os.getpid()

    @ray.remote
    def f(actor, signal):
        ray.get(signal.wait.remote())
        return ray.get(actor.method.remote())

    signal = SignalActor.remote()
    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    # Pass the handle to another task that cannot run yet.
    x_id = f.remote(a, signal)
    # Delete the original handle. The actor should not get killed yet.
    del a

    # Once the task finishes, the actor process should get killed.
    ray.get(signal.send.remote())
    assert ray.get(x_id) == 1
    wait_for_pid_to_exit(pid)

    # Create another actor with the same resource requirement to make sure the
    # old one was not restarted.
    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
Example #16
def test_fetch_local(ray_start_cluster_head):
    cluster = ray_start_cluster_head
    cluster.add_node(num_cpus=2, object_store_memory=75 * 1024 * 1024)

    signal_actor = SignalActor.remote()

    @ray.remote
    def put():
        ray.wait([signal_actor.wait.remote()])
        return np.random.rand(5 * 1024 * 1024)  # 40 MB data

    local_ref = ray.put(np.random.rand(5 * 1024 * 1024))
    remote_ref = put.remote()
    # Data is not ready on any node.
    (ready_ref, remaining_ref) = ray.wait([remote_ref],
                                          timeout=2,
                                          fetch_local=False)
    assert (0, 1) == (len(ready_ref), len(remaining_ref))
    ray.wait([signal_actor.send.remote()])

    # Data is ready on some node, but not on the local node.
    (ready_ref, remaining_ref) = ray.wait([remote_ref], fetch_local=False)
    assert (1, 0) == (len(ready_ref), len(remaining_ref))
    (ready_ref, remaining_ref) = ray.wait([remote_ref],
                                          timeout=2,
                                          fetch_local=True)
    assert (0, 1) == (len(ready_ref), len(remaining_ref))
    del local_ref
    (ready_ref, remaining_ref) = ray.wait([remote_ref], fetch_local=True)
    assert (1, 0) == (len(ready_ref), len(remaining_ref))
Example #17
def test_pull_manager_at_capacity_reports(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=0, object_store_memory=int(1e8))
    ray.init(address=cluster.address)
    cluster.add_node(num_cpus=1, object_store_memory=int(1e8))

    object_size = int(1e7)
    refs = []
    for _ in range(20):
        refs.append(ray.put(np.zeros(object_size, dtype=np.uint8)))

    def fetches_queued():
        return "fetches queued" in memory_summary(stats_only=True)

    assert not fetches_queued()

    @ray.remote
    def f(s, ref):
        ray.get(s.wait.remote())

    signal = SignalActor.remote()
    xs = [f.remote(signal, ref) for ref in refs]
    wait_for_condition(fetches_queued)

    signal.send.remote()
    ray.get(xs)
    wait_for_condition(lambda: not fetches_queued())
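`wait_for_condition` simply polls a predicate until it returns True or a timeout expires; the job-manager examples use an async variant of the same idea. A minimal sketch, assuming the polling behavior the tests rely on:

import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100,
                       **kwargs):
    # Re-evaluate the predicate every retry_interval_ms until it holds.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor(**kwargs):
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")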
Example #18
def test_cancel_multiple_dependents(ray_start_regular, use_force):
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    head = wait_for.remote([signaler.wait.remote()])
    deps = []
    for _ in range(3):
        deps.append(wait_for.remote([head]))

    assert len(ray.wait([head], timeout=0.1)[0]) == 0
    ray.cancel(head, force=use_force)
    for d in deps:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(d)

    head2 = wait_for.remote([signaler.wait.remote()])

    deps2 = []
    for _ in range(3):
        deps2.append(wait_for.remote([head2]))

    for d in deps2:
        ray.cancel(d, force=use_force)

    for d in deps2:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(d)

    signaler.send.remote()
    ray.get(head2)
Example #19
    async def test_successful_job(self, job_manager):
        """Test tailing logs for a PENDING -> RUNNING -> SUCCESSFUL job."""
        start_signal_actor = SignalActor.remote()

        with tempfile.TemporaryDirectory() as tmp_dir:
            _, tmp_file, job_id = await _run_hanging_command(
                job_manager, tmp_dir, start_signal_actor=start_signal_actor)

            # TODO(edoakes): check we get no logs before actor starts (not sure
            # how to timeout the iterator call).
            assert job_manager.get_job_status(job_id) == JobStatus.PENDING

            # Signal job to start.
            ray.get(start_signal_actor.send.remote())

            await self._tail_and_assert_logs(job_id,
                                             job_manager,
                                             expected_log="Waiting...",
                                             num_iteration=5)

            # Signal the job to exit by writing to the file.
            with open(tmp_file, "w") as f:
                print("hello", file=f)

            async for lines in job_manager.tail_job_logs(job_id):
                assert all(s == "Waiting..."
                           for s in lines.strip().split("\n"))
                print(lines, end="")

            await async_wait_for_condition(check_job_succeeded,
                                           job_manager=job_manager,
                                           job_id=job_id)
Example #20
def _setup_cluster_for_test(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
    # Add worker nodes.
    for _ in range(NUM_NODES - 1):
        cluster.add_node()
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    worker_should_exit = SignalActor.remote()

    # Generate a metric in the driver.
    counter = Counter("test_driver_counter", description="desc")
    counter.inc()

    # Generate some metrics from actor & tasks.
    @ray.remote
    def f():
        counter = Counter("test_counter", description="desc")
        counter.inc()
        counter = ray.get(ray.put(counter))  # Test serialization.
        counter.inc()
        counter.inc(2)
        ray.get(worker_should_exit.wait.remote())

    # Generate some metrics for the placement group.
    pg = ray.util.placement_group(bundles=[{"CPU": 1}])
    ray.get(pg.ready())
    print(ray.util.placement_group_table())
    ray.util.remove_placement_group(pg)

    @ray.remote
    class A:
        async def ping(self):
            histogram = Histogram("test_histogram",
                                  description="desc",
                                  boundaries=[0.1, 1.6])
            histogram = ray.get(ray.put(histogram))  # Test serialization.
            histogram.record(1.5)
            ray.get(worker_should_exit.wait.remote())

    a = A.remote()
    obj_refs = [f.remote(), a.ping.remote()]

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")
    autoscaler_export_addr = "{}:{}".format(cluster.head_node.node_ip_address,
                                            AUTOSCALER_METRIC_PORT)
    yield prom_addresses, autoscaler_export_addr

    ray.get(worker_should_exit.send.remote())
    ray.get(obj_refs)
    ray.shutdown()
    cluster.shutdown()
Example #21
def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
    @ray.remote(num_cpus=0)
    class Counter:
        def __init__(self):
            self.count = 0
            self.event = asyncio.Event()

        def increment(self):
            self.count += 1
            self.event.set()

        async def wait_for_count(self, count):
            while True:
                if self.count >= count:
                    return
                await self.event.wait()
                self.event.clear()

    @ray.remote
    class ActorToKill:
        def __init__(self, counter):
            counter.increment.remote()

        def run(self, counter, signal):
            counter.increment.remote()
            ray.get(signal.wait.remote())

    @ray.remote
    class CallingActor:
        def __init__(self):
            self.actor = ray.get_actor("a")

        def call_other(self, counter, signal):
            return ray.get(self.actor.run.remote(counter, signal))

    init_counter = Counter.remote()
    run_counter = Counter.remote()
    signal = SignalActor.remote()

    # Start the two actors, wait for ActorToKill's constructor to run.
    a = ActorToKill.options(name="a", max_restarts=-1, max_task_retries=-1).remote(
        init_counter
    )
    c = CallingActor.remote()
    ray.get(init_counter.wait_for_count.remote(1), timeout=30)

    # Signal the CallingActor to call ActorToKill, wait for it to be running,
    # then kill ActorToKill.
    # Verify that this causes ActorToKill's constructor to run a second time
    # and the run method to begin a second time.
    ref = c.call_other.remote(run_counter, signal)
    ray.get(run_counter.wait_for_count.remote(1), timeout=30)
    ray.kill(a, no_restart=False)
    ray.get(init_counter.wait_for_count.remote(2), timeout=30)
    ray.get(run_counter.wait_for_count.remote(2), timeout=30)

    # Signal the run method to finish, verify that the CallingActor returns.
    signal.send.remote()
    ray.get(ref, timeout=30)
Example #22
def test_e2e_bursty(serve_instance):
    """
    Sends 100 requests in bursts. Uses delays for smooth provisioning.
    """

    signal = SignalActor.remote()

    @serve.deployment(
        _autoscaling_config={
            "metrics_interval_s": 0.1,
            "min_replicas": 1,
            "max_replicas": 2,
            "look_back_period_s": 0.2,
            "downscale_delay_s": 0.2,
            "upscale_delay_s": 0.2,
        },
        # We will send a lot of queries, so a short graceful shutdown makes
        # sure replicas are killed quickly during cleanup.
        _graceful_shutdown_timeout_s=1,
        max_concurrent_queries=1000,
        version="v1",
    )
    class A:
        def __call__(self):
            ray.get(signal.wait.remote())

    A.deploy()

    controller = serve_instance._controller
    start_time = get_deployment_start_time(controller, A)

    handle = A.get_handle()
    [handle.remote() for _ in range(100)]

    wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 2)
    num_replicas = get_num_running_replicas(controller, A)
    signal.send.remote()

    # Execute a bursty workload that issues 100 requests every 0.05 seconds.
    # The SignalActor allows all requests in a burst to be queued before they
    # are all executed, which drives up the measured number of in-flight
    # requests per replica; each send() then brings it back to 0. This bursty
    # behavior should be smoothed out by the delay parameters.
    for _ in range(5):
        time.sleep(0.05)
        assert get_num_running_replicas(controller, A) == num_replicas
        [handle.remote() for _ in range(100)]
        signal.send.remote()

    # As the queue is drained, we should scale back down.
    wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 1)

    # Make sure start time did not change for the deployment
    assert get_deployment_start_time(controller, A) == start_time
Example #23
def test_recursively_pass_returned_object_ref(one_worker_100MiB, use_ray_put,
                                              failure):
    @ray.remote
    def return_an_id():
        return put_object(
            np.zeros(20 * 1024 * 1024, dtype=np.uint8), use_ray_put)

    @ray.remote(max_retries=1)
    def recursive(ref, signal, max_depth, depth=0):
        inner_id = ray.get(ref[0])
        if depth == max_depth:
            ray.get(signal.wait.remote())
            if failure:
                os._exit(0)
            return inner_id
        else:
            return inner_id, recursive.remote(ref, signal, max_depth,
                                              depth + 1)

    max_depth = 5
    outer_oid = return_an_id.remote()
    signal = SignalActor.remote()
    head_oid = recursive.remote([outer_oid], signal, max_depth)

    # Remove the local reference.
    inner_oid = None
    outer_oid = head_oid
    for i in range(max_depth):
        inner_oid, outer_oid = ray.get(outer_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(outer_oid, succeed=False)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())

    try:
        # Check that the remote reference pins the object.
        ray.get(outer_oid)
        _fill_object_store_and_get(inner_oid)
        assert not failure
    except ray.exceptions.OwnerDiedError:
        # There is only 1 core, so the same worker will execute all `recursive`
        # tasks. Therefore, if we kill the worker during the last task, its
        # owner (the worker that executed the second-to-last task) will also
        # have died.
        assert failure

    inner_oid_bytes = inner_oid.binary()
    del inner_oid
    del head_oid
    del outer_oid

    # Reference should be gone, check that returned ID gets evicted.
    _fill_object_store_and_get(inner_oid_bytes, succeed=False)
Example #24
def test_schedule_actor_and_normal_task(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(
        memory=1024 ** 3, _system_config={"gcs_actor_scheduling_enabled": True}
    )
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    @ray.remote(memory=600 * 1024 ** 2, num_cpus=0.01)
    class Foo:
        def method(self):
            return 2

    @ray.remote(memory=600 * 1024 ** 2, num_cpus=0.01)
    def fun(signal1, signal_actor2):
        signal_actor2.send.remote()
        ray.get(signal1.wait.remote())
        return 1

    signal1 = SignalActor.remote()
    signal2 = SignalActor.remote()

    o1 = fun.remote(signal1, signal2)
    # Make sure the normal task is executing.
    ray.get(signal2.wait.remote())

    # The normal task is blocked now.
    # Try to create actor and make sure this actor is not created for the time
    # being.
    foo = Foo.remote()
    o2 = foo.method.remote()
    ready_list, remaining_list = ray.wait([o2], timeout=2)
    assert len(ready_list) == 0 and len(remaining_list) == 1

    # Send a signal to unblock the normal task execution.
    ray.get(signal1.send.remote())

    # Check the result of normal task.
    assert ray.get(o1) == 1

    # Make sure the actor is created.
    assert ray.get(o2) == 2
Example #25
def test_get_throws_quickly_when_found_exception(ray_start_regular):
    # We use an actor instead of functions here. If we use functions, it's
    # very likely that two normal tasks are submitted before the first worker
    # is registered to Raylet. Since `maximum_startup_concurrency` is 1,
    # the worker pool will wait for the registration of the first worker
    # and skip starting new workers. The result is, the two tasks will be
    # executed sequentially, which breaks an assumption of this test case -
    # the two tasks run in parallel.
    @ray.remote
    class Actor(object):
        def bad_func1(self):
            raise Exception("Test function intentionally failed.")

        def bad_func2(self):
            os._exit(0)

        def slow_func(self, signal):
            ray.get(signal.wait.remote())

    def expect_exception(objects, exception):
        with pytest.raises(ray.exceptions.RayError) as err:
            ray.get(objects)
        assert err.type is exception

    signal1 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    expect_exception(
        [actor.bad_func1.remote(),
         actor.slow_func.remote(signal1)],
        ray.exceptions.RayTaskError,
    )
    ray.get(signal1.send.remote())

    signal2 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    expect_exception(
        [actor.bad_func2.remote(),
         actor.slow_func.remote(signal2)],
        ray.exceptions.RayActorError,
    )
    ray.get(signal2.send.remote())
Example #26
def test_fastapi_shutdown_hook(serve_instance):
    # https://github.com/ray-project/ray/issues/18349
    shutdown_signal = SignalActor.remote()
    del_signal = SignalActor.remote()

    app = FastAPI()

    @app.on_event("shutdown")
    def call_signal():
        shutdown_signal.send.remote()

    @serve.deployment
    @serve.ingress(app)
    class A:
        def __del__(self):
            del_signal.send.remote()

    A.deploy()
    A.delete()
    ray.get(shutdown_signal.wait.remote(), timeout=20)
    ray.get(del_signal.wait.remote(), timeout=20)
Example #27
def test_e2e_intermediate_downscaling(serve_instance):
    """
    Scales up, then down, and up again.
    """

    signal = SignalActor.remote()

    @serve.deployment(
        _autoscaling_config={
            "metrics_interval_s": 0.1,
            "min_replicas": 1,
            "max_replicas": 20,
            "look_back_period_s": 0.2,
            "downscale_delay_s": 0.2,
            "upscale_delay_s": 0.2,
        },
        # We will send a lot of queries, so a short graceful shutdown makes
        # sure replicas are killed quickly during cleanup.
        _graceful_shutdown_timeout_s=1,
        max_concurrent_queries=1000,
        version="v1",
    )
    class A:
        def __call__(self):
            ray.get(signal.wait.remote())

    A.deploy()

    controller = serve_instance._controller
    start_time = get_deployment_start_time(controller, A)

    handle = A.get_handle()
    [handle.remote() for _ in range(50)]

    wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 20,
                       timeout=30)
    signal.send.remote()

    wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 1,
                       timeout=30)
    signal.send.remote(clear=True)

    [handle.remote() for _ in range(50)]
    wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 20,
                       timeout=30)

    signal.send.remote()
    # As the queue is drained, we should scale back down.
    wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 1,
                       timeout=30)

    # Make sure start time did not change for the deployment
    assert get_deployment_start_time(controller, A) == start_time
Example #28
def test_captured_object_ref(one_worker_100MiB):
    captured_id = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))

    @ray.remote
    def f(signal):
        ray.get(signal.wait.remote())
        ray.get(captured_id)  # noqa: F821

    signal = SignalActor.remote()
    obj_ref = f.remote(signal)

    # Delete local references.
    del f
    del captured_id

    # Test that the captured object ref is pinned despite having no local
    # references.
    ray.get(signal.send.remote())
    _fill_object_store_and_get(obj_ref)

    captured_id = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))

    @ray.remote
    class Actor:
        def get(self, signal):
            ray.get(signal.wait.remote())
            ray.get(captured_id)  # noqa: F821

    signal = SignalActor.remote()
    actor = Actor.remote()
    obj_ref = actor.get.remote(signal)

    # Delete local references.
    del Actor
    del captured_id

    # Test that the captured object ref is pinned despite having no local
    # references.
    ray.get(signal.send.remote())
    _fill_object_store_and_get(obj_ref)
Example #29
def test_recursively_pass_returned_object_ref(one_worker_100MiB, use_ray_put,
                                              failure):
    @ray.remote
    def return_an_id():
        return put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8),
                          use_ray_put)

    @ray.remote(max_retries=1)
    def recursive(ref, signal, max_depth, depth=0):
        inner_id = ray.get(ref[0])
        if depth == max_depth:
            ray.get(signal.wait.remote())
            if failure:
                os._exit(0)
            return inner_id
        else:
            return inner_id, recursive.remote(ref, signal, max_depth,
                                              depth + 1)

    max_depth = 5
    outer_oid = return_an_id.remote()
    signal = SignalActor.remote()
    head_oid = recursive.remote([outer_oid], signal, max_depth)

    # Remove the local reference.
    inner_oid = None
    outer_oid = head_oid
    for i in range(max_depth):
        inner_oid, outer_oid = ray.get(outer_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(outer_oid, succeed=False)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())

    try:
        # Check that the remote reference pins the object.
        ray.get(outer_oid)
        _fill_object_store_and_get(inner_oid)
        assert not failure
    # TODO(edoakes): this should raise WorkerError.
    except ray.exceptions.ObjectLostError:
        assert failure

    inner_oid_bytes = inner_oid.binary()
    del inner_oid
    del head_oid
    del outer_oid

    # Reference should be gone, check that returned ID gets evicted.
    _fill_object_store_and_get(inner_oid_bytes, succeed=False)
Example #30
def test_cancel_chain(ray_start_regular, use_force):
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    obj1 = wait_for.remote([signaler.wait.remote()])
    obj2 = wait_for.remote([obj1])
    obj3 = wait_for.remote([obj2])
    obj4 = wait_for.remote([obj3])

    assert len(ray.wait([obj1], timeout=0.1)[0]) == 0
    ray.cancel(obj1, force=use_force)
    for ob in [obj1, obj2, obj3, obj4]:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ob)

    signaler2 = SignalActor.remote()
    obj1 = wait_for.remote([signaler2.wait.remote()])
    obj2 = wait_for.remote([obj1])
    obj3 = wait_for.remote([obj2])
    obj4 = wait_for.remote([obj3])

    assert len(ray.wait([obj3], timeout=0.1)[0]) == 0
    ray.cancel(obj3, force=use_force)
    for ob in [obj3, obj4]:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ob)

    with pytest.raises(GetTimeoutError):
        ray.get(obj1, timeout=0.1)

    with pytest.raises(GetTimeoutError):
        ray.get(obj2, timeout=0.1)

    signaler2.send.remote()
    ray.get(obj1)