Example #1
0
    def from_dict(pg_dict: dict) -> "PlacementGroup":
        """Rebuild a PlacementGroup from its json-serializable dict form.

        Used by Ray Client on server-side to deserialize placement group
        option. See decode_options in util/client/server/server.py.

        Args:
            pg_dict: Dictionary representing a placement group. It must
                contain exactly the keys "id" (a hex string) and
                "bundle_cache" (a list, or None).
        Returns:
            A placement group made from the data in the input dict.
        """
        # Check the shape of the payload before decoding anything.
        assert isinstance(pg_dict, dict)
        assert pg_dict.keys() == {"id", "bundle_cache"}
        # The id travels as a hex string.
        assert isinstance(pg_dict["id"], str)
        cached_bundles = pg_dict["bundle_cache"]
        assert cached_bundles is None or isinstance(cached_bundles, list)

        # Decode the hex id back to raw bytes and build the placement group.
        group_id = PlacementGroupID(bytes.fromhex(pg_dict["id"]))
        return PlacementGroup(group_id, cached_bundles)
Example #2
0
def test_remove_placement_group(ray_start_cluster):
    """Removing placement groups: unknown ids, fresh groups, repeated calls."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    # Removing an id that was never created should be a no-op, even when
    # the request is repeated.
    unknown_pg_id = PlacementGroupID.from_random()
    for _attempt in range(3):
        ray.experimental.remove_placement_group(unknown_pg_id)

    # Removing a placement group right after it is created should work.
    pg_id = ray.experimental.placement_group([{"CPU": 2}, {"CPU": 2}])
    ray.experimental.remove_placement_group(pg_id)

    def is_placement_group_removed():
        # A table entry without a "state" key counts as "not removed yet".
        table = ray.experimental.placement_group_table(pg_id)
        return "state" in table and table["state"] == "REMOVED"

    wait_for_condition(is_placement_group_removed)

    # Now create a placement group that is actually used before removal.
    pg_id = ray.experimental.placement_group([{"CPU": 2}, {"CPU": 2}])

    # HACK: calling an actor placed in the group is used to wait for the
    # placement group to be created.
    # TODO(sang): Remove it when wait is implemented.
    @ray.remote(num_cpus=0)
    class A:
        def f(self):
            return 3

    actor = A.options(placement_group_id=pg_id).remote()
    actor_result = ray.get(actor.f.remote())
    assert actor_result == 3
    ray.experimental.remove_placement_group(pg_id)
    # Further remove requests for the same group shouldn't do anything.
    for _attempt in range(3):
        ray.experimental.remove_placement_group(pg_id)

    # The group's resources must be released, so a task that needs all
    # four CPUs of the node can be scheduled.
    @ray.remote(num_cpus=4)
    def f():
        return 3

    task_result = ray.get(f.remote())
    assert task_result == 3
Example #3
0
def get_placement_group(placement_group_name: str) -> PlacementGroup:
    """Get a placement group object with a global name.

    The name is looked up in the namespace of the current global worker.

    Args:
        placement_group_name: Name of the placement group to look up.
            Must be non-empty.

    Returns:
        The placement group object registered under the given name.

    Raises:
        ValueError: If ``placement_group_name`` is empty, or if no
            placement group with that name can be found.
    """
    if not placement_group_name:
        raise ValueError("Please supply a non-empty value to get_placement_group")
    worker = ray._private.worker.global_worker
    worker.check_connected()
    placement_group_info = ray._private.state.state.get_placement_group_by_name(
        placement_group_name, worker.namespace
    )
    if placement_group_info is None:
        # The message previously said "actor", which was misleading for a
        # placement group lookup.
        raise ValueError(
            f"Failed to look up placement group with name: {placement_group_name}"
        )
    # The stored id is hex-encoded; convert it back to binary for the ID type.
    return PlacementGroup(
        PlacementGroupID(hex_to_binary(placement_group_info["placement_group_id"]))
    )
Example #4
0
def test_placement_group_client_option_serialization():
    """Round-trip placement groups through their json-serializable dict form.

    Covers both directions,
    placement_group -> dict -> placement_group and
    dict -> placement_group -> dict,
    each with a populated bundle cache and with no bundle cache.
    """

    def assert_dict_round_trip(original_dict):
        # dict -> placement group -> dict must reproduce the input.
        restored_dict = PlacementGroup.from_dict(original_dict).to_dict()
        assert original_dict == restored_dict

    def assert_pg_round_trip(original_pg):
        # placement group -> dict -> placement group must keep id and bundles.
        restored_pg = PlacementGroup.from_dict(original_pg.to_dict())
        assert restored_pg.id == original_pg.id
        assert restored_pg.bundle_cache == original_pg.bundle_cache

    raw_id = bytes(16)
    pg_id = PlacementGroupID(id=raw_id)
    id_string = raw_id.hex()
    bundle_cache = [{"CPU": 2}, {"custom_resource": 5}]

    # Object-first direction, with and without bundles.
    assert_pg_round_trip(PlacementGroup(id=pg_id, bundle_cache=bundle_cache))
    assert_pg_round_trip(PlacementGroup(id=pg_id))

    # Dict-first direction, with and without bundles.
    for cache in (bundle_cache, None):
        assert_dict_round_trip({"id": id_string, "bundle_cache": cache})
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    """Check placement group removal end to end.

    Verifies that removing an unknown group and removing the same group
    repeatedly are harmless, that a removed group's resources can be reused
    by a task needing all 4 CPUs, and that the actor and the long-running
    task scheduled in the removed group fail afterwards.

    Args:
        ray_start_cluster: Cluster fixture; a 4-CPU node is added to it.
        connect_to_client: Forwarded to connect_to_client_or_not
            (presumably selects whether the body runs through Ray Client;
            confirm against that helper).
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # Warm up the cluster.
    ray.get([warmup.remote() for _ in range(4)])

    with connect_to_client_or_not(connect_to_client):
        # First try to remove a placement group that doesn't
        # exist. This should not do anything.
        random_group_id = PlacementGroupID.from_random()
        random_placement_group = PlacementGroup(random_group_id)
        for _ in range(3):
            ray.util.remove_placement_group(random_placement_group)

        # Removing a placement group as soon as it is
        # created should work.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))

        # Now let's create a placement group.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        # Create an actor that occupies resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # Currently, there's no way to prevent
        # tasks to be retried for removed placement group.
        # Set max_retries=0 for testing.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Schedule a long running task and actor.
        task_ref = long_running_task.options(
            placement_group=placement_group).remote()
        a = A.options(placement_group=placement_group).remote()
        assert ray.get(a.f.remote()) == 3

        ray.util.remove_placement_group(placement_group)
        # Subsequent remove request shouldn't do anything.
        for _ in range(3):
            ray.util.remove_placement_group(placement_group)

        # Make sure placement group resources are
        # released and we can schedule this task.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3
        # Since the placement group is removed,
        # the actor should've been killed.
        # That means this request should fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(a.f.remote(), timeout=3.0)
        # The long-running task was scheduled in the removed group too, so
        # fetching its result is expected to raise as well.
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(task_ref)
Example #6
0
 def empty() -> "PlacementGroup":
     """Return a PlacementGroup built around the nil PlacementGroupID."""
     nil_id = PlacementGroupID.nil()
     return PlacementGroup(nil_id)
Example #7
0
 def empty():
     """Build a PlacementGroup whose id is the nil PlacementGroupID."""
     nil_group_id = PlacementGroupID.nil()
     return PlacementGroup(nil_group_id)