Example No. 1
def test_placement_group_task_resource_ids(ray_start_cluster,
                                           connect_to_client):
    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 2}])
        o1 = f.options(placement_group=g1).remote()
        resources = ray.get(o1)
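        # Grounded in the asserts below: without a bundle index, the task is
        # only assigned the generic "CPU_group_*" resource, not a
        # bundle-indexed "CPU_group_0_*" one.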
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources
        assert "CPU_group_0_" not in list(resources.keys())[0], resources

        # Now retry with a bundle index constraint.
        o1 = f.options(placement_group=g1,
                       placement_group_bundle_index=0).remote()
        resources = ray.get(o1)
        assert len(resources) == 2, resources
        keys = list(resources.keys())
        assert "CPU_group_" in keys[0], resources
        assert "CPU_group_" in keys[1], resources
        assert ("CPU_group_0_" in keys[0]
                or "CPU_group_0_" in keys[1]), resources

        placement_group_assert_no_leak([g1])
Example No. 2
def test_placement_group_gpu_set(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Two nodes, each with one CPU and one GPU.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(name="name",
                                                   strategy="PACK",
                                                   bundles=[{
                                                       "CPU": 1,
                                                       "GPU": 1
                                                   }, {
                                                       "CPU": 1,
                                                       "GPU": 1
                                                   }])

        @ray.remote(num_gpus=1)
        def get_gpus():
            return ray.get_gpu_ids()

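        # Each node has a single GPU, so each bundle lands on its own node
        # and the task in either bundle sees GPU id 0.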
        result = get_gpus.options(placement_group=placement_group,
                                  placement_group_bundle_index=0).remote()
        result = ray.get(result)
        assert result == [0]

        result = get_gpus.options(placement_group=placement_group,
                                  placement_group_bundle_index=1).remote()
        result = ray.get(result)
        assert result == [0]
Example No. 3
def test_placement_group_hang(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Warm up the workers so that this triggers the hang race.
        ray.get(f.remote())

        g1 = ray.util.placement_group([{"CPU": 2}])
        # This will start out infeasible. The placement group will then be
        # created and it transitions to feasible.
        o1 = f.options(placement_group=g1).remote()

        resources = ray.get(o1)
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources

        placement_group_assert_no_leak([g1])
Example No. 4
def test_placement_group_wait(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    [cluster.add_node(num_cpus=2) for _ in range(2)]
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):
        # Wait on a placement group that can be created.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[
                {
                    "CPU": 2
                },
                {
                    "CPU": 2
                },
            ],
        )
        ready, unready = ray.wait([placement_group.ready()])
        assert len(unready) == 0
        assert len(ready) == 1
        table = ray.util.placement_group_table(placement_group)
        assert table["state"] == "CREATED"

        pg = ray.get(placement_group.ready())
        assert pg.bundle_specs == placement_group.bundle_specs
        assert pg.id.binary() == placement_group.id.binary()
Example No. 5
def test_check_bundle_index(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(name="name",
                                                   strategy="SPREAD",
                                                   bundles=[{
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }])

        with pytest.raises(ValueError, match="bundle index 3 is invalid"):
            Actor.options(placement_group=placement_group,
                          placement_group_bundle_index=3).remote()

        with pytest.raises(ValueError, match="bundle index -2 is invalid"):
            Actor.options(placement_group=placement_group,
                          placement_group_bundle_index=-2).remote()

        with pytest.raises(ValueError, match="bundle index must be -1"):
            Actor.options(placement_group_bundle_index=0).remote()

        placement_group_assert_no_leak([placement_group])
Example No. 6
def test_placement_group_synchronous_registration(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # One node which only has one CPU.
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that has two bundles and the `STRICT_PACK`
        # strategy, so its registration succeeds but its scheduling fails.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[
                {
                    "CPU": 1,
                },
                {"CPU": 1},
            ],
        )
        # Make sure we can properly remove it immediately
        # as its registration is synchronous.
        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))
Example No. 7
def test_pending_placement_group_wait(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    [cluster.add_node(num_cpus=2) for _ in range(1)]
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):
        # Wait on placement group that cannot be created.
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[
                {
                    "CPU": 2
                },
                {
                    "CPU": 2
                },
                {
                    "GPU": 2
                },
            ],
        )
        ready, unready = ray.wait([placement_group.ready()], timeout=0.1)
        assert len(unready) == 1
        assert len(ready) == 0
        table = ray.util.placement_group_table(placement_group)
        assert table["state"] == "PENDING"
        with pytest.raises(ray.exceptions.GetTimeoutError):
            ray.get(placement_group.ready(), timeout=0.1)
Example No. 8
def test_placement_group_strict_spread(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{
                "CPU": 2
            }, {
                "CPU": 2
            }, {
                "CPU": 2
            }])
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        # Get all actors.
        actor_infos = ray.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
        actor_info_3 = actor_infos.get(actor_3._actor_id.hex())

        assert actor_info_1 and actor_info_2 and actor_info_3

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        node_of_actor_3 = actor_info_3["Address"]["NodeID"]
        assert node_of_actor_1 != node_of_actor_2
        assert node_of_actor_1 != node_of_actor_3
        assert node_of_actor_2 != node_of_actor_3

        placement_group_assert_no_leak([placement_group])
Example No. 9
def test_placement_group_pack(ray_start_cluster, connect_to_client,
                              gcs_actor_scheduling_enabled):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        cluster.add_node(
            num_cpus=4,
            _system_config={
                "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
            } if i == 0 else {},
        )
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="PACK",
            bundles=[
                {
                    "CPU": 2,
                    "GPU": 0
                },  # Test 0 resource spec doesn't break tests.
                {
                    "CPU": 2
                },
            ],
        )
        ray.get(placement_group.ready())
        actor_1 = Actor.options(placement_group=placement_group,
                                placement_group_bundle_index=0).remote()
        actor_2 = Actor.options(placement_group=placement_group,
                                placement_group_bundle_index=1).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all the actors are collocated on one node.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

        assert actor_info_1 and actor_info_2

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        assert node_of_actor_1 == node_of_actor_2
        placement_group_assert_no_leak([placement_group])
Example No. 10
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Create a head node
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.worker.global_worker.current_node_id

        worker_node_ids = {
            ray.get(get_node_id.options(resources={
                f"foo:{i}": 1
            }).remote())
            for i in range(2)
        }
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)

        @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.worker.global_worker.current_node_id

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.worker.global_worker.current_node_id

        locations = []
        locations.append(task1.remote())
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)
        locations.append(
            task2.options(
                scheduling_strategy=SPREAD_SCHEDULING_STRATEGY).remote())
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids
Example No. 11
def test_placement_group_strict_pack(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[
                {
                    "memory": 50 * 1024 *
                    1024,  # Test memory resource spec doesn't break tests.
                    "CPU": 2
                },
                {
                    "CPU": 2
                }
            ])
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())

        # Get all actors.
        actor_infos = ray.state.actors()

        # Make sure all the actors are collocated on one node.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

        assert actor_info_1 and actor_info_2

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        assert node_of_actor_1 == node_of_actor_2

        placement_group_assert_no_leak([placement_group])
Example No. 12
def test_client_context_manager(ray_start_regular_shared, connect_to_client):
    import ray
    with connect_to_client_or_not(connect_to_client):
        if connect_to_client:
            # Client mode is on.
            assert client_mode_should_convert(auto_init=True)
            # We're connected to Ray client.
            assert ray.util.client.ray.is_connected()
        else:
            assert not client_mode_should_convert(auto_init=True)
            assert not ray.util.client.ray.is_connected()
Example No. 13
def test_placement_group_spread(ray_start_cluster, connect_to_client,
                                gcs_actor_scheduling_enabled):
    @ray.remote
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        cluster.add_node(
            num_cpus=4,
            _system_config={
                "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
            } if i == 0 else {},
        )
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{
                "CPU": 2
            }, {
                "CPU": 2
            }],
        )
        ray.get(placement_group.ready())
        actors = [
            Actor.options(
                placement_group=placement_group,
                placement_group_bundle_index=i,
                num_cpus=2,
            ).remote() for i in range(num_nodes)
        ]

        [ray.get(actor.value.remote()) for actor in actors]

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_objs = [
            actor_infos.get(actor._actor_id.hex()) for actor in actors
        ]
        assert are_pairwise_unique(
            [info_obj["Address"]["NodeID"] for info_obj in actor_info_objs])

        placement_group_assert_no_leak([placement_group])
Example No. 14
def test_cuda_visible_devices(ray_start_cluster, connect_to_client):
    @ray.remote(num_gpus=1)
    def f():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_gpus=1)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
        o1 = f.options(placement_group=g1).remote()

        devices = ray.get(o1)
        assert devices == "0", devices
Example No. 15
def test_remove_pending_placement_group(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that cannot be scheduled now.
        placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])
        ray.util.remove_placement_group(placement_group)

        # TODO(sang): Add state check here.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        # Make sure this task is still schedulable.
        assert ray.get(f.remote()) == 3
Example No. 16
def test_placement_group_actor_resource_ids(ray_start_cluster,
                                            connect_to_client):
    @ray.remote(num_cpus=1)
    class F:
        def f(self):
            return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 2}])
        a1 = F.options(placement_group=g1).remote()
        resources = ray.get(a1.f.remote())
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources
Example No. 17
def test_schedule_placement_group_when_node_add(ray_start_cluster,
                                                connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Creating a placement group that cannot be satisfied yet.
        placement_group = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])

        def is_placement_group_created():
            table = ray.util.placement_group_table(placement_group)
            if "state" not in table:
                return False
            return table["state"] == "CREATED"

        # Add a node that has GPU.
        cluster.add_node(num_cpus=4, num_gpus=4)

        # Make sure the placement group is created.
        wait_for_condition(is_placement_group_created)
Example No. 18
def test_placement_ready(ray_start_regular, connect_to_client):
    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def v(self):
            return 10

    # "bundle" is a resource name reserved for placement groups and can't be
    # used in bundles.
    with pytest.raises(Exception):
        ray.util.placement_group(bundles=[{"bundle": 1}])
    # This tests the case where, even when all the resources in the
    # bundle are allocated, we are still able to return from ready(),
    # since ready() uses 0 CPU.
    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(bundles=[{"CPU": 1}])
        ray.get(pg.ready())
        a = Actor.options(num_cpus=1, placement_group=pg).remote()
        ray.get(a.v.remote())
        ray.get(pg.ready())
Example No. 19
def test_schedule_placement_groups_at_the_same_time(connect_to_client):
    ray.init(num_cpus=4)

    with connect_to_client_or_not(connect_to_client):
        pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]

        wait_pgs = {pg.ready(): pg for pg in pgs}

        def is_all_placement_group_removed():
            ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
            if ready:
                ready_pg = wait_pgs[ready[0]]
                remove_placement_group(ready_pg)
                del wait_pgs[ready[0]]

            if len(wait_pgs) == 0:
                return True
            return False

        wait_for_condition(is_all_placement_group_removed)

    ray.shutdown()
Example No. 20
def test_placement_group_gpu_assigned(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_gpus=2)
    ray.init(address=cluster.address)
    gpu_ids_res = set()

    @ray.remote(num_gpus=1, num_cpus=0)
    def f():
        import os
        return os.environ["CUDA_VISIBLE_DEVICES"]

    with connect_to_client_or_not(connect_to_client):
        pg1 = ray.util.placement_group([{"GPU": 1}])
        pg2 = ray.util.placement_group([{"GPU": 1}])

        assert pg1.wait(10)
        assert pg2.wait(10)

        gpu_ids_res.add(ray.get(f.options(placement_group=pg1).remote()))
        gpu_ids_res.add(ray.get(f.options(placement_group=pg2).remote()))

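        # pg1 and pg2 each reserve a different GPU on the node, so the two
        # tasks should see different CUDA_VISIBLE_DEVICES values.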
        assert len(gpu_ids_res) == 2
Example No. 21
def test_capture_child_tasks(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_tasks = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_tasks, num_gpus=total_num_tasks)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [
                {
                    "CPU": 2,
                    "GPU": 2,
                },
                {
                    "CPU": 2,
                    "GPU": 2,
                },
            ],
            strategy="STRICT_PACK",
        )
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test if tasks capture child tasks.
        @ray.remote
        def task():
            return get_current_placement_group()

        @ray.remote
        def create_nested_task(child_cpu, child_gpu, set_none=False):
            assert get_current_placement_group() is not None
            kwargs = {
                "num_cpus": child_cpu,
                "num_gpus": child_gpu,
            }
            if set_none:
                kwargs["placement_group"] = None
            return ray.get([task.options(**kwargs).remote() for _ in range(3)])

        t = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ).remote(1, 0)
        pgs = ray.get(t)
        # Every task should have the current placement group because child
        # tasks are implicitly captured by default.
        assert None not in pgs

        t1 = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ).remote(1, 0, True)
        pgs = ray.get(t1)
        # Every task should have no placement group since it is explicitly
        # set to None.
        assert set(pgs) == {None}

        # Test if tasks don't capture child tasks when the option is off.
        t2 = create_nested_task.options(num_cpus=0,
                                        num_gpus=1,
                                        placement_group=pg).remote(0, 1)
        pgs = ray.get(t2)
        # All placement groups should be None since we don't capture child
        # tasks.
        assert not all(pgs)
Example No. 22
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{
            "CPU": 2
        }, {
            "CPU": 2
        }],
                                      strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(placement_group=pg,
                          placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node.
        # (why? The placement group has STRICT_PACK strategy).
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node ids should be identical, the set size should be 1.
        assert len(node_id_set) == 1

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture child tasks.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure that when None is specified, actors are not
        # scheduled in the same placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the child actors are not scheduled in the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2
Example No. 23
def test_mini_integration(ray_start_cluster, connect_to_client):
    # Create enough bundles to cover all the GPUs in the cluster.
    # Do some random work and make sure all resources are properly recovered.

    cluster = ray_start_cluster

    num_nodes = 5
    per_bundle_gpus = 2
    gpu_per_node = 4
    total_gpus = num_nodes * per_bundle_gpus * gpu_per_node
    per_node_gpus = per_bundle_gpus * gpu_per_node

    bundles_per_pg = 2
    total_num_pg = total_gpus // (bundles_per_pg * per_bundle_gpus)

    [
        cluster.add_node(num_cpus=2, num_gpus=per_bundle_gpus * gpu_per_node)
        for _ in range(num_nodes)
    ]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(num_cpus=0, num_gpus=1)
        def random_tasks():
            import time
            import random

            sleep_time = random.uniform(0.1, 0.2)
            time.sleep(sleep_time)
            return True

        pgs = []
        pg_tasks = []
        # total bundle gpu usage = bundles_per_pg*total_num_pg*per_bundle_gpus
        # Note this is half of total
        for index in range(total_num_pg):
            pgs.append(
                ray.util.placement_group(
                    name=f"name{index}",
                    strategy="PACK",
                    bundles=[{
                        "GPU": per_bundle_gpus
                    } for _ in range(bundles_per_pg)],
                ))

        # Schedule tasks.
        for i in range(total_num_pg):
            pg = pgs[i]
            pg_tasks.append([
                random_tasks.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index).remote()
                for bundle_index in range(bundles_per_pg)
            ])

        # Make sure tasks are done and we remove placement groups.
        num_removed_pg = 0
        pg_indexes = [2, 3, 1, 7, 8, 9, 0, 6, 4, 5]
        while num_removed_pg < total_num_pg:
            index = pg_indexes[num_removed_pg]
            pg = pgs[index]
            assert all(ray.get(pg_tasks[index]))
            ray.util.remove_placement_group(pg)
            num_removed_pg += 1

        @ray.remote(num_cpus=2, num_gpus=per_node_gpus)
        class A:
            def ping(self):
                return True

        # Make sure all resources are properly returned by scheduling
        # actors that take up all existing resources.
        actors = [A.remote() for _ in range(num_nodes)]
        assert all(ray.get([a.ping.remote() for a in actors]))
Example No. 24
def test_atomic_creation(ray_start_cluster, connect_to_client):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        time.sleep(6)
        return True

    with connect_to_client_or_not(connect_to_client):
        # Schedule tasks to fail initial placement group creation.
        tasks = [bothering_task.remote() for _ in range(2)]

        # Make sure the two bothering tasks have been scheduled.
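        # Each bothering task needs 3 of the 8 total CPUs, so 2 CPUs remain
        # available once both are running.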
        def tasks_scheduled():
            return ray.available_resources()["CPU"] == 2.0

        wait_for_condition(tasks_scheduled)

        # Create an actor that will fail bundle scheduling.
        # It is important to use pack strategy to make test less flaky.
        pg = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{
                "CPU": bundle_cpu_size
            } for _ in range(num_nodes * bundle_per_node)],
        )

        # Create a placement group actor.
        # This shouldn't be scheduled because atomic
        # placement group creation should've failed.
        pg_actor = NormalActor.options(
            placement_group=pg,
            placement_group_bundle_index=num_nodes * bundle_per_node - 1,
        ).remote()

        # Wait on the placement group now. It should be unready
        # because the bothering tasks are holding resources that are
        # required to create the bundles.
        ready, unready = ray.wait([pg.ready()], timeout=0.5)
        assert len(ready) == 0
        assert len(unready) == 1
        # Wait until all tasks are done.
        assert all(ray.get(tasks))

        # Wait on the placement group creation. Since resources are now
        # available, it should be ready soon.
        ready, unready = ray.wait([pg.ready()])
        assert len(ready) == 1
        assert len(unready) == 0

        # Confirm that the placement group actor is created. It will
        # raise an exception if actor was scheduled before placement
        # group was created thus it checks atomicity.
        ray.get(pg_actor.ping.remote(), timeout=3.0)
        ray.kill(pg_actor)

        # Make sure atomic creation failure didn't impact resources.
        @ray.remote(num_cpus=bundle_cpu_size)
        def resource_check():
            return True

        # These should hang because all resources
        # are claimed by the placement group.
        check_without_pg = [
            resource_check.remote() for _ in range(bundle_per_node * num_nodes)
        ]

        # These should all be scheduled, one on each bundle.
        check_with_pg = [
            resource_check.options(placement_group=pg,
                                   placement_group_bundle_index=i).remote()
            for i in range(bundle_per_node * num_nodes)
        ]

        # Make sure these are hanging.
        ready, unready = ray.wait(check_without_pg, timeout=0)
        assert len(ready) == 0
        assert len(unready) == bundle_per_node * num_nodes

        # Make sure these are all scheduled.
        assert all(ray.get(check_with_pg))

        ray.util.remove_placement_group(pg)

        def pg_removed():
            return ray.util.placement_group_table(pg)["state"] == "REMOVED"

        wait_for_condition(pg_removed)

        # Make sure the checks without a placement group are all
        # scheduled properly because the resources have been cleaned up.
        assert all(ray.get(check_without_pg))
Example No. 25
def test_placement_group_bin_packing_priority(ray_start_cluster,
                                              connect_to_client,
                                              scheduling_strategy):
    @ray.remote
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    def index_to_actor(pg, index):
        if index < 2:
            return Actor.options(placement_group=pg,
                                 placement_group_bundle_index=index,
                                 num_cpus=1).remote()
        elif index < 3:
            return Actor.options(placement_group=pg,
                                 placement_group_bundle_index=index,
                                 num_gpus=1).remote()
        else:
            return Actor.options(
                placement_group=pg,
                placement_group_bundle_index=index,
                object_store_memory=1024 * 1024 * 200,
            ).remote()

    def add_nodes_to_cluster(cluster):
        cluster.add_node(num_cpus=1)
        cluster.add_node(num_cpus=2)
        cluster.add_node(num_gpus=1)
        cluster.add_node(object_store_memory=1024 * 1024 * 250)

    default_bundles = [
        {
            "CPU": 1
        },
        {
            "CPU": 2
        },
        {
            "CPU": 1,
            "GPU": 1
        },
        {
            "CPU": 1,
            "object_store_memory": 1024 * 1024 * 200
        },
    ]

    default_num_nodes = len(default_bundles)
    cluster = ray_start_cluster
    add_nodes_to_cluster(cluster)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy=scheduling_strategy,
            bundles=default_bundles,
        )
        ray.get(placement_group.ready())

        actors = [
            index_to_actor(placement_group, i)
            for i in range(default_num_nodes)
        ]

        [ray.get(actor.value.remote()) for actor in actors]

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all the actors are located on separate nodes.
        actor_info_objs = [
            actor_infos.get(actor._actor_id.hex()) for actor in actors
        ]
        assert are_pairwise_unique(
            [info_obj["Address"]["NodeID"] for info_obj in actor_info_objs])
Example No. 26
def test_placement_group_reschedule_when_node_dead(ray_start_cluster,
                                                   connect_to_client):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure the head node and both worker nodes are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(name="name",
                                                   strategy="SPREAD",
                                                   bundles=[{
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }])
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())
        placement_group_assert_no_leak([placement_group])
        ray.shutdown()
Example No. 27
def test_placement_group_table(ray_start_cluster, connect_to_client):
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    pgs_created = []

    with connect_to_client_or_not(connect_to_client):
        # Initially, placement group creation should be pending because
        # there are not enough resources (no GPUs yet).
        name = "name"
        strategy = "PACK"
        bundles = [{"CPU": 2, "GPU": 1}, {"CPU": 2}]
        placement_group = ray.util.placement_group(name=name,
                                                   strategy=strategy,
                                                   bundles=bundles)
        pgs_created.append(placement_group)
        result = ray.util.placement_group_table(placement_group)
        assert result["name"] == name
        assert result["strategy"] == strategy
        for i in range(len(bundles)):
            assert bundles[i] == result["bundles"][i]
        assert result["state"] == "PENDING"

        # Now the placement group should be scheduled.
        cluster.add_node(num_cpus=5, num_gpus=1)
        cluster.wait_for_nodes()

        actor_1 = Actor.options(placement_group=placement_group,
                                placement_group_bundle_index=0).remote()
        ray.get(actor_1.value.remote())

        result = ray.util.placement_group_table(placement_group)
        assert result["state"] == "CREATED"

        # Add two more placement groups for the placement group table test.
        second_strategy = "SPREAD"
        pgs_created.append(
            ray.util.placement_group(name="second_placement_group",
                                     strategy=second_strategy,
                                     bundles=bundles))
        pgs_created.append(
            ray.util.placement_group(name="third_placement_group",
                                     strategy=second_strategy,
                                     bundles=bundles))

        placement_group_table = ray.util.placement_group_table()
        assert len(placement_group_table) == 3

        true_name_set = {
            "name", "second_placement_group", "third_placement_group"
        }
        get_name_set = set()

        for _, placement_group_data in placement_group_table.items():
            get_name_set.add(placement_group_data["name"])

        assert true_name_set == get_name_set

        placement_group_assert_no_leak(pgs_created)
Example No. 28
def test_placement_group_scheduling_strategy(ray_start_cluster,
                                             connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()

    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        assert ray.get(
            get_node_id_1.options(
                num_cpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg)).remote()) == worker_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(get_node_id_2.remote()) == worker_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        class Actor1():
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_id.remote()) == worker_node_id

        @ray.remote
        class Actor2():
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor2 = Actor2.options(
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg)).remote()
        assert ray.get(actor2.get_node_id.remote()) == worker_node_id

    with pytest.raises(ValueError):

        @ray.remote(scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg))
        def func():
            return 0

        func.options(placement_group=pg).remote()

    with pytest.raises(ValueError):

        @ray.remote
        def func():
            return 0

        func.options(scheduling_strategy="XXX").remote()

    with pytest.raises(ValueError):

        @ray.remote
        def func():
            return 0

        func.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=None)).remote()
Example No. 29
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # warm up the cluster.
    ray.get([warmup.remote() for _ in range(4)])

    with connect_to_client_or_not(connect_to_client):
        # First try to remove a placement group that doesn't
        # exist. This should not do anything.
        random_group_id = PlacementGroupID.from_random()
        random_placement_group = PlacementGroup(random_group_id)
        for _ in range(3):
            ray.util.remove_placement_group(random_placement_group)

        # Removing a placement group as soon as it is
        # created should work.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))

        # Now let's create a placement group.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        # Create an actor that occupies resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # Currently, there's no way to prevent
        # tasks from being retried for a removed placement group.
        # Set max_retries=0 for testing.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Schedule a long running task and actor.
        task_ref = long_running_task.options(
            placement_group=placement_group).remote()
        a = A.options(placement_group=placement_group).remote()
        assert ray.get(a.f.remote()) == 3

        ray.util.remove_placement_group(placement_group)
        # Subsequent remove request shouldn't do anything.
        for _ in range(3):
            ray.util.remove_placement_group(placement_group)

        # Make sure placement group resources are
        # released and we can schedule this task.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3
        # Since the placement group is removed,
        # the actor should've been killed.
        # That means this request should fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(a.f.remote(), timeout=3.0)
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(task_ref)
Example No. 30
def test_default_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=16,
                     resources={"head": 1},
                     _system_config={"scheduler_spread_threshold": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()

    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    ray.get(pg.ready())
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        head_node_id = ray.get(
            get_node_id_1.options(resources={
                "head": 1
            }).remote())
        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        assert ray.get(get_node_id_1.remote()) == head_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(
            get_node_id_2.options(
                scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).remote()
        ) == head_node_id

        @ray.remote
        def get_node_id_3():
            return ray.worker.global_worker.current_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg,
                        placement_group_capture_child_tasks=True))
        class Actor1():
            def get_node_ids(self):
                return [
                    ray.worker.global_worker.current_node_id,
                    # Use parent's placement group
                    ray.get(get_node_id_3.remote()),
                    ray.get(
                        get_node_id_3.options(
                            scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).
                        remote())
                ]

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_ids.remote()) == \
               [worker_node_id, worker_node_id, head_node_id]