Example #1
        def set_count(self, count):
            _internal_kv_put("count", count, True)

    # Verify we can get the object successfully.
    ra = RestartableActor.remote()
    ray.get(ra.f.remote())


@pytest.mark.parametrize("ray_start_regular", [{
    "num_cpus": 2,
    "resources": {
        "a": 1
    }
}],
                         indirect=True)
@pytest.mark.skipif(new_scheduler_enabled(), reason="todo hangs")
def test_pending_actor_removed_by_owner(ray_start_regular):
    # Verify that when the owner of pending actors is killed, the actors'
    # resources are correctly returned.

    @ray.remote(num_cpus=1, resources={"a": 1})
    class A:
        def __init__(self):
            self.actors = []

        def create_actors(self):
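            # Each B requires the single "a" resource, which this actor already
            # holds, so the B actors created below stay pending.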
            self.actors = [B.remote() for _ in range(2)]

    @ray.remote(resources={"a": 1})
    class B:
        def ping(self):
Example #2
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2, fail=True)
    check_refcounts({dep: (1, 1), result: (1, 0)})
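    # Let the first task finish so that `dep` becomes available in plasma.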
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})


@pytest.mark.skipif(new_scheduler_enabled(), reason="dynamic res todo")
def test_actor_creation_task(ray_start_regular):
    @ray.remote
    def large_object():
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote(resources={"init": 1})
    class Actor:
        def __init__(self, dependency):
            return

        def ping(self):
            return

    a = Actor.remote(large_object.remote())
Example #3
    new_scheduler_enabled,
)


@ray.remote
class Increase:
    def method(self, x):
        return x + 2


@ray.remote
def increase(x):
    return x + 1


@pytest.mark.skipif(new_scheduler_enabled(), reason="broken")
@pytest.mark.parametrize("ray_start_regular", [
    generate_system_config_map(num_heartbeats_timeout=20,
                               ping_gcs_rpc_server_max_retries=60)
],
                         indirect=True)
def test_gcs_server_restart(ray_start_regular):
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    # Simulate a GCS failure by killing the GCS server and then restarting it.
    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # The existing actor handle should still work after the restart.
    result = ray.get(actor1.method.remote(7))
    assert result == 9
Example #4
    # Run a chain of tasks that needs more workers than `num_cpus`, even
    # though the CPU resources actually in use stay within `num_cpus`.
    obj = foo.remote(4)
    wait_for_condition(lambda: len(get_workers()) == 4)

    ray.get(obj)
    # After the tasks finish, some workers are killed to keep the total
    # number of workers <= num_cpus.
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)
    # The two remaining workers should stay alive.
    assert len(get_workers()) == 2


@pytest.mark.skipif(new_scheduler_enabled(), reason="fails")
def test_worker_capping_fifo(shutdown_only):
    # Start 2 initial workers by setting num_cpus to 2.
    info = ray.init(num_cpus=2)
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)

    @ray.remote
    def getpid():
        return os.getpid()

    worker1, worker2 = get_workers()

    # Make sure worker1 is the worker that did not execute the task.
    if worker1.pid == ray.get(getpid.remote()):
        worker1, worker2 = worker2, worker1
Example #5
        print("Counts are {}.".format(counts))
        if (len(names) == num_nodes
                and all(count >= minimum_count for count in counts)):
            break
        attempts += 1
    assert attempts < num_attempts

    # Make sure we can get the results of a bunch of tasks.
    results = []
    for _ in range(1000):
        index = np.random.randint(num_actors)
        results.append(actors[index].get_location.remote())
    ray.get(results)


@pytest.mark.skipif(new_scheduler_enabled(), reason="multi node broken")
def test_actor_lifetime_load_balancing(ray_start_cluster):
    cluster = ray_start_cluster
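    # The head node has no CPUs, so the actors created below must be placed
    # on the worker nodes.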
    cluster.add_node(num_cpus=0)
    num_nodes = 3
    for i in range(num_nodes):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            pass

        def ping(self):
            return
Example #6
    def f():
        time.sleep(0.01)
        return ray.worker.global_worker.node.unique_id

    def local():
        return ray.get(f.remote()) == ray.worker.global_worker.node.unique_id

    # Wait for a worker to get started.
    wait_for_condition(local)

    # Check that we are scheduling locally while there are resources available.
    for i in range(20):
        assert local()


@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often")
def test_load_balancing_with_dependencies(ray_start_cluster):
    # This test ensures that tasks are being assigned to all raylets in a
    # roughly equal manner even when the tasks have dependencies.
    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
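        # Sleep briefly so that tasks overlap and can be spread across raylets.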
        time.sleep(0.010)
        return ray.worker.global_worker.node.unique_id

    # This object will be local to one of the raylets. Make sure
Example #7
            replay_buffer.append(ref)
            solution_buffer.append(arr)

    print("-----------------------------------")
    # Randomly sample objects and check that they match the stored arrays.
    for _ in range(1000):
        index = random.choice(list(range(buffer_length)))
        ref = replay_buffer[index]
        solution = solution_buffer[index]
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, solution)


@pytest.mark.skipif(
    platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
def test_spill_during_get(object_spilling_config, shutdown_only):
    ray.init(
        num_cpus=4,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled": True,
            "object_store_full_initial_delay_ms": 100,
            # NOTE(swang): Use infinite retries because the OOM timer can still
            # get accidentally triggered when objects are released too slowly
            # (see github.com/ray-project/ray/issues/12040).
            "object_store_full_max_retries": -1,
            "max_io_workers": 1,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
        },
Example #8
    new_scheduler_enabled,
)


@ray.remote
class Increase:
    def method(self, x):
        return x + 2


@ray.remote
def increase(x):
    return x + 1


@pytest.mark.skipif(new_scheduler_enabled(), reason="notimpl")
@pytest.mark.parametrize(
    "ray_start_regular", [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_gcs_server_restart(ray_start_regular):
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # Creating new actors should also work after the GCS server restart.
    actor2 = Increase.remote()
Example #9
        return 1

    ray.get(f.remote())

    # We should be able to create an actor that requires 0 CPU resources.
    @ray.remote(num_cpus=0)
    class Actor:
        def method(self):
            pass

    a = Actor.remote()
    x = a.method.remote()
    ray.get(x)


@pytest.mark.skipif(new_scheduler_enabled(), reason="zero cpu handling")
def test_zero_cpus_actor(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=0)
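    # Only this second node has CPU resources, so tasks and actors should be
    # scheduled on it.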
    valid_node = cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return ray.worker.global_worker.node.unique_id

    # Make sure tasks and actors run on the remote raylet.
    a = Foo.remote()
    assert valid_node.unique_id == ray.get(a.method.remote())
Example #10
            if demand.shape == one_cpu_shape:
                one_cpu_found = True
        assert one_cpu_found

        # Check that we differentiate between infeasible and ready tasks.
        for demand in checker.report:
            if resource2 in demand.shape:
                assert demand.num_infeasible_requests_queued > 0
                assert demand.num_ready_requests_queued == 0
            else:
                assert demand.num_ready_requests_queued > 0
                assert demand.num_infeasible_requests_queued == 0
    global_state_accessor.disconnect()


@pytest.mark.skipif(new_scheduler_enabled(),
                    reason="requires placement groups")
def test_placement_group_load_report(ray_start_cluster):
    cluster = ray_start_cluster
    # Add a head node that doesn't have any GPU resources.
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
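    # The GlobalStateAccessor is used to read the cluster's resource usage and
    # load report from the GCS.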
    global_state_accessor = GlobalStateAccessor(
        cluster.address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    class PgLoadChecker:
        def nothing_is_ready(self):
            resource_usage = self._read_resource_usage()
            if not resource_usage:
                return False