Example #1
    def testOthersTakingResources(self):
        # Let someone occupy the head node
        pg = placement_group([{"CPU": 4, "GPU": 1}])
        ray.get(pg.ready())
        # We are left with the second node
        assert len(nodes()) == 1
        assert default_device(refresh=True) == "GPU"

        pg = placement_group([{"GPU": 1}])
        ray.get(pg.ready())
        # Default device should be CPU
        assert default_device(refresh=True) == "CPU"
        assert len(nodes()) == 1
Example #2
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    pg = placement_group([{"GPU": 1}], strategy="STRICT_PACK")
    pg.ready()
    f.options(placement_group=pg).remote()

    errors = get_error_message(
        p, 1, ray_constants.INFEASIBLE_TASK_ERROR, timeout=5)
    assert len(errors) == 0, errors
Example #3
def _create_placement_group(num_cpus_per_actor, num_actors):
    """
    Create Ray placement group to grab resources.

    Parameters
    ----------
    num_cpus_per_actor : int
        Number of CPUs per actor.
    num_actors : int
        Number of actors.

    Returns
    -------
    ray.util.PlacementGroup
        Placement group with grabbed resources.
    """
    cpu_bundle = {"CPU": num_cpus_per_actor}
    bundles = [cpu_bundle for _ in range(num_actors)]

    pg = placement_group(bundles, strategy="SPREAD")

    ready, _ = ray.wait([pg.ready()], timeout=100)

    # ray.wait returns an empty "ready" list (never None) on timeout.
    if not ready:
        raise TimeoutError("Placement group creation timeout.")

    return pg
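
A minimal usage sketch for the helper above; the ActorStub class, the ray.init() arguments, and the bundle indices are illustrative assumptions rather than part of the original source.

import ray
from ray.util.placement_group import remove_placement_group


@ray.remote(num_cpus=1)
class ActorStub:
    # Illustrative actor, not from the original example.
    def ping(self):
        return "pong"


ray.init(num_cpus=4)
pg = _create_placement_group(num_cpus_per_actor=1, num_actors=4)
# Pin one actor to each 1-CPU bundle of the group.
actors = [
    ActorStub.options(placement_group=pg,
                      placement_group_bundle_index=i).remote()
    for i in range(4)
]
ray.get([actor.ping.remote() for actor in actors])
remove_placement_group(pg)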
    def pg_launcher(num_pgs_to_create):
        print("Creating pgs")
        pgs = []
        for i in range(num_pgs_to_create):
            pgs.append(placement_group(bundles, strategy="STRICT_SPREAD"))

        pgs_removed = []
        pgs_unremoved = []
        # Randomly choose placement groups to remove.
        if pg_removal:
            print("removing pgs")
        for pg in pgs:
            if random() < 0.5 and pg_removal:
                pgs_removed.append(pg)
            else:
                pgs_unremoved.append(pg)
        print(len(pgs_unremoved))

        tasks = []
        # Randomly schedule tasks or actors on placement groups that
        # are not removed.
        for pg in pgs_unremoved:
            for i in range(num_nodes):
                tasks.append(
                    mock_task.options(placement_group=pg,
                                      placement_group_bundle_index=i).remote())
        # Remove the rest of placement groups.
        if pg_removal:
            for pg in pgs_removed:
                remove_placement_group(pg)
        ray.get(tasks)
        # Since placement groups are scheduled, remove them.
        for pg in pgs_unremoved:
            remove_placement_group(pg)
Example #5
def main():
    ray.init(address="auto")

    bundles = [{"CPU": 1, "GPU": 1}]
    bundles += [{"CPU": 1} for _ in range(NUM_CPU_BUNDLES)]

    pg = placement_group(bundles, strategy="PACK")

    ray.get(pg.ready())

    workers = [
        Worker.options(placement_group=pg).remote(i)
        for i in range(NUM_CPU_BUNDLES)
    ]

    trainer = Trainer.options(placement_group=pg).remote(0)

    start = time.time()
    while True:
        ray.get([workers[i].work.remote() for i in range(NUM_CPU_BUNDLES)])
        ray.get(trainer.train.remote())
        end = time.time()
        if end - start > RUNTIME:
            break

    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        results = {}
        json.dump(results, out_file)
Example #6
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(
            placement_group(bundles, strategy="STRICT_SPREAD", name=str(i)))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        # if random() < .5:
        mock_task.options(placement_group=pg).remote()
        # else:
        #     MockActor.options(placement_group=pg).remote()

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved], timeout=10)
    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
Example #7
def test_placement_group_removal_leak_regression(ray_start_cluster):
    """Related issue:
        https://github.com/ray-project/ray/issues/19131
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=5)
    ray.init(address=cluster.address)

    TOTAL_CPUS = 8
    bundles = [{"CPU": 1, "GPU": 1}]
    bundles += [{"CPU": 1} for _ in range(TOTAL_CPUS - 1)]

    pg = placement_group(bundles, strategy="PACK")
    # Here, we simulate that the ready task is queued and
    # the new node is up. As soon as the new node is up,
    # the ready task is scheduled.
    # See https://github.com/ray-project/ray/pull/19138
    # for more details about the test.
    o = pg.ready()
    # Add an artificial delay until the new node is up.
    time.sleep(3)
    cluster.add_node(num_cpus=5, num_gpus=1)
    ray.get(o)
    bundle_resource_name = f"bundle_group_{pg.id.hex()}"
    expected_bundle_wildcard_val = TOTAL_CPUS * 1000

    # This should fail if there's a leakage
    # because the bundle resources are never returned properly.
    def check_bundle_leaks():
        bundle_resources = ray.available_resources()[bundle_resource_name]
        return expected_bundle_wildcard_val == bundle_resources

    wait_for_condition(check_bundle_leaks)
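
As a small aside to the test above, here is a hedged helper sketch (not from the original source) for inspecting the group-scoped resource entries that back this assertion; it relies only on the fact, shown in the test, that those resource keys embed pg.id.hex().

def pg_resource_view(pg):
    """Illustrative helper: return the resource entries Ray exposes for pg."""
    pg_id = pg.id.hex()
    return {
        key: value
        for key, value in ray.available_resources().items()
        if pg_id in key
    }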
Example #8
def test_pg_actor_workloads(ray_start_regular_with_external_redis):
    from ray.util.placement_group import placement_group

    bundle1 = {"CPU": 1}
    pg = placement_group([bundle1], strategy="STRICT_PACK")

    ray.get(pg.ready())

    @ray.remote
    class Counter:
        def r(self, v):
            return v

        def pid(self):
            import os

            return os.getpid()

    c = Counter.options(placement_group=pg).remote()
    r = ray.get(c.r.remote(10))
    assert r == 10

    print("GCS is killed")
    pid = ray.get(c.pid.remote())
    ray.worker._global_node.kill_gcs_server()

    assert ray.get(c.r.remote(10)) == 10

    ray.worker._global_node.start_gcs_server()

    for _ in range(100):
        assert pid == ray.get(c.pid.remote())
Example #9
def spread_to_all_nodes(f: RemoteFunction):
    nodes = ray.state.nodes()
    resources = [{'CPU': f._num_cpus} for _ in range(len(nodes))]
    pg = placement_group(resources, strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    yield len(nodes), pg
    remove_placement_group(pg)
def test_fractional_resources_handle_correct(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1000)
    ray.init(address=cluster.address)

    bundles = [{"CPU": 0.01} for _ in range(5)]
    pg = placement_group(bundles, strategy="SPREAD")

    ray.get(pg.ready(), timeout=10)
def test_placement_group_local_resource_view(monkeypatch, ray_start_cluster):
    """Please refer to https://github.com/ray-project/ray/pull/19911
    for more details.
    """
    with monkeypatch.context() as m:
        # Increase the broadcast interval so that the node resource update
        # arrives at the raylet only after all local resources have been
        # allocated.
        m.setenv("RAY_raylet_report_resources_period_milliseconds", "2000")
        m.setenv("RAY_grpc_based_resource_broadcast", "true")
        cluster = ray_start_cluster

        cluster.add_node(num_cpus=16, object_store_memory=1e9)
        cluster.wait_for_nodes()
        # We need to init here so that the driver connects to the raylet that
        # only has CPU resources.
        # This is a hacky way to prevent scheduling from hanging: otherwise the
        # <CPU:1> workloads could be scheduled on the GPU node, leaving no node
        # with enough resources for the <GPU:1, CPU:1> workload.
        ray.init(address="auto")
        cluster.add_node(num_cpus=16, num_gpus=1)
        cluster.wait_for_nodes()
        NUM_CPU_BUNDLES = 30

        @ray.remote(num_cpus=1)
        class Worker(object):
            def __init__(self, i):
                self.i = i

            def work(self):
                time.sleep(0.1)
                print("work ", self.i)

        @ray.remote(num_cpus=1, num_gpus=1)
        class Trainer(object):
            def __init__(self, i):
                self.i = i

            def train(self):
                time.sleep(0.2)
                print("train ", self.i)

        bundles = [{"CPU": 1, "GPU": 1}]
        bundles += [{"CPU": 1} for _ in range(NUM_CPU_BUNDLES)]
        pg = placement_group(bundles, strategy="PACK")
        ray.get(pg.ready())

        # Local resources will be allocated; here we ensure the local view
        # stays consistent and stale node resource updates are discarded.
        workers = [
            Worker.options(placement_group=pg).remote(i)
            for i in range(NUM_CPU_BUNDLES)
        ]
        trainer = Trainer.options(placement_group=pg).remote(0)
        ray.get([workers[i].work.remote() for i in range(NUM_CPU_BUNDLES)])
        ray.get(trainer.train.remote())
Example #12
def _create_placement_group(num_cpus_per_actor, num_actors):
    cpu_bundle = {"CPU": num_cpus_per_actor}
    bundles = [cpu_bundle for _ in range(num_actors)]

    pg = placement_group(bundles, strategy="SPREAD")

    ready, _ = ray.wait([pg.ready()], timeout=100)

    # ray.wait returns an empty "ready" list (never None) on timeout.
    if not ready:
        raise TimeoutError("Placement group creation timeout.")

    return pg
def main():
    """Run a long running placement group creation/removal tests.

    This test runs 20 trials first and measure the P50 performance.

    After that it runs trials for a long time and make sure the
    P50 creation/scheduling/removal performance is not regressed
    after the long running job.
    """
    args, _ = parse_script_args()
    NUM_PG_AT_EACH_STAGE = args.num_pgs_stage
    NUM_PENDING_PG = args.num_pending_pgs
    TOTAL_STAGE = args.num_stages

    if args.local:
        ray.init(resources={"custom": 100, "pending": 1})
    else:
        ray.init(address="auto")

    assert ray.cluster_resources()["custom"] >= NUM_PG_AT_EACH_STAGE * 4
    assert ray.cluster_resources()["pending"] >= 1

    # Create pending placement groups.
    pending_pgs = []
    for _ in range(NUM_PENDING_PG):
        # Right now, we don't have infeasible pgs,
        # so this will simulate the pending pgs.
        pending_pgs.append(placement_group([{"pending": 1}], strategy="PACK"))

    (scheduling_perf, removing_perf,
     creation_perf) = run_trial(20, NUM_PG_AT_EACH_STAGE)
    (scheduling_perf_final, removing_perf_final,
     creation_perf_final) = run_trial(TOTAL_STAGE, NUM_PG_AT_EACH_STAGE)

    print(f"Scheduling performance 20 trials: {scheduling_perf}")
    print(
        f"Scheduling performance {TOTAL_STAGE} trials: {scheduling_perf_final}"
    )
    print(f"Removal performance 20 trials: {removing_perf}")
    print(f"Removal performance {TOTAL_STAGE} trials: {removing_perf_final}")
    print(f"Creation performance 20 trials: {creation_perf}")
    print(f"Creation performance {TOTAL_STAGE} trials: {creation_perf_final}")

    assert scheduling_perf["p50_ms"] * 100 > scheduling_perf_final["p50_ms"]
    assert removing_perf["p50_ms"] * 100 > removing_perf_final["p50_ms"]
    assert creation_perf["p50_ms"] * 100 > creation_perf_final["p50_ms"]

    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        results = {}
        json.dump(results, out_file)
Example #14
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    foo = Foo.remote()
    print(foo)
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    total_cpus = ray.cluster_resources()["CPU"]

    # Occupy one cpu by an actor
    @ray.remote(num_cpus=1)
    class A:
        pass

    a = A.remote()
    print(a)

    @ray.remote(num_cpus=total_cpus)
    def g():
        pass

    pg = placement_group([{"CPU": total_cpus}], strategy="STRICT_PACK")
    g.options(placement_group=pg).remote()

    errors = get_error_message(p,
                               1,
                               ray_constants.INFEASIBLE_TASK_ERROR,
                               timeout=5)
    assert len(errors) == 0, errors
Example #15
def test_many_placement_groups():
    @ray.remote(num_cpus=1, resources={"node": 0.02})
    def f1():
        sleep(10)
        pass

    @ray.remote(num_cpus=1)
    def f2():
        sleep(10)
        pass

    @ray.remote(resources={"node": 0.02})
    def f3():
        sleep(10)
        pass

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    refs = []
    for pg in tqdm(pgs, desc="Scheduling tasks"):
        ref1 = f1.options(placement_group=pg).remote()
        ref2 = f2.options(placement_group=pg).remote()
        ref3 = f3.options(placement_group=pg).remote()
        refs.extend([ref1, ref2, ref3])

    for _ in trange(10, desc="Waiting"):
        sleep(1)

    with tqdm() as p_bar:
        while refs:
            done, refs = ray.wait(refs)
            p_bar.update()

    for pg in tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
Example #16
def test_many_placement_groups():
    # @ray.remote(num_cpus=1, resources={"node": 0.02})
    @ray.remote
    class C1:
        def ping(self):
            return "pong"

    # @ray.remote(num_cpus=1)
    @ray.remote
    class C2:
        def ping(self):
            return "pong"

    # @ray.remote(resources={"node": 0.02})
    @ray.remote
    class C3:
        def ping(self):
            return "pong"

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in tqdm.trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm.tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    actors = []
    for pg in tqdm.tqdm(pgs, desc="Scheduling tasks"):
        actors.append(C1.options(placement_group=pg).remote())
        actors.append(C2.options(placement_group=pg).remote())
        actors.append(C3.options(placement_group=pg).remote())

    not_ready = [actor.ping.remote() for actor in actors]
    for _ in tqdm.trange(len(actors)):
        ready, not_ready = ray.wait(not_ready)
        assert ray.get(*ready) == "pong"

    for pg in tqdm.tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
Example #17
def test_schedule_placement_groups_at_the_same_time():
    ray.init(num_cpus=4)

    pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]

    wait_pgs = {pg.ready(): pg for pg in pgs}

    def is_all_placement_group_removed():
        ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
        if ready:
            ready_pg = wait_pgs[ready[0]]
            remove_placement_group(ready_pg)
            del wait_pgs[ready[0]]

        if len(wait_pgs) == 0:
            return True
        return False

    wait_for_condition(is_all_placement_group_removed)
def test_placement_group_gpu_unique_assigned(ray_start_cluster,
                                             connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_gpus=4, num_cpus=4)
    ray.init(address=cluster.address)
    gpu_ids_res = set()

    # Create placement group with 4 bundles using 1 GPU each.
    num_gpus = 4
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_gpus)]
    pg = placement_group(bundles)
    ray.get(pg.ready())

    # Actor using 1 GPU that has a method to get
    #  $CUDA_VISIBLE_DEVICES env variable.
    @ray.remote(num_gpus=1, num_cpus=1)
    class Actor:
        def get_gpu(self):
            import os

            return os.environ["CUDA_VISIBLE_DEVICES"]

    # Create actors out of order.
    actors = []
    actors.append(
        Actor.options(placement_group=pg,
                      placement_group_bundle_index=0).remote())
    actors.append(
        Actor.options(placement_group=pg,
                      placement_group_bundle_index=3).remote())
    actors.append(
        Actor.options(placement_group=pg,
                      placement_group_bundle_index=2).remote())
    actors.append(
        Actor.options(placement_group=pg,
                      placement_group_bundle_index=1).remote())

    for actor in actors:
        gpu_ids = ray.get(actor.get_gpu.remote())
        assert len(gpu_ids) == 1
        gpu_ids_res.add(gpu_ids)

    assert len(gpu_ids_res) == 4
Example #19
def test_infeasible_pg(ray_start_cluster):
    """Test infeasible pgs are scheduled after new nodes are added."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init("auto")

    bundle = {"CPU": 4, "GPU": 1}
    pg = placement_group([bundle], name="worker_1", strategy="STRICT_PACK")

    # Placement group is infeasible.
    with pytest.raises(GetTimeoutError):
        ray.get(pg.ready(), timeout=3)

    state = ray.util.placement_group_table()[
        pg.id.hex()]["stats"]["scheduling_state"]
    assert state == "INFEASIBLE"

    # Add a new node. PG can now be scheduled.
    cluster.add_node(num_cpus=4, num_gpus=1)
    assert ray.get(pg.ready(), timeout=10)
Example #20
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(placement_group(BUNDLES, strategy="STRICT_SPREAD"))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    tasks = []
    max_actor_cnt = 5
    actor_cnt = 0
    actors = []
    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        if random() < .5:
            tasks.append(mock_task.options(placement_group=pg).remote())
        else:
            if actor_cnt < max_actor_cnt:
                actors.append(MockActor.options(placement_group=pg).remote())
                actor_cnt += 1

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved])
    ray.get(tasks)
    ray.get([actor.ping.remote() for actor in actors])
    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
Example #21
def test_chaos_defer(monkeypatch, ray_start_cluster):
    with monkeypatch.context() as m:
        m.setenv("RAY_grpc_based_resource_broadcast", "true")
        # Delay PrepareBundleResources RPCs by 2s (2,000,000 us).
        m.setenv(
            "RAY_testing_asio_delay_us",
            "NodeManagerService.grpc_client.PrepareBundleResources=2000000:2000000",
        )
        m.setenv("RAY_event_stats", "true")
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=1, object_store_memory=1e9)
        cluster.wait_for_nodes()
        ray.init(address="auto")  # this will connect to gpu nodes
        cluster.add_node(num_cpus=0, num_gpus=1)
        bundle = [{"GPU": 1}, {"CPU": 1}]
        pg = placement_group(bundle)
        # The PG will not become ready within the 1s timeout because bundle
        # preparation is delayed.
        with pytest.raises(ray.exceptions.GetTimeoutError):
            ray.get(pg.ready(), timeout=1)
        # it'll be ready eventually
        ray.get(pg.ready())
Example #22
    def __call__(self, *args, **kwargs):
        kwargs.update(self._bound.kwargs)
        # Call with the bound *args and **kwargs.
        return placement_group(*self._bound.args, **kwargs)
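
The snippet above is only a method fragment. For context, here is a sketch of a wrapper class such a method could plausibly belong to, pre-binding placement_group() arguments with inspect; the class name and surrounding structure are assumptions, not the original source.

import inspect

from ray.util.placement_group import placement_group


class BoundPlacementGroupFactory:
    """Hypothetical wrapper that pre-binds placement_group() arguments."""

    def __init__(self, *args, **kwargs):
        # Validate and store the partially bound arguments.
        self._bound = inspect.signature(placement_group).bind_partial(
            *args, **kwargs)

    def __call__(self, *args, **kwargs):
        kwargs.update(self._bound.kwargs)
        # Call with the bound *args and **kwargs.
        return placement_group(*self._bound.args, **kwargs)


# Usage (requires an initialized Ray cluster): bind the bundles and strategy
# once, then create placement groups later on demand.
factory = BoundPlacementGroupFactory([{"CPU": 1}], strategy="PACK")
pg = factory()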
Example #23
        value = torch.randn(batch_size)
        input = torch.randn(batch_size)
        return {
            "input": input,
            "value": value,
            "actor_id": self.id,
        }


tot_gpus = 1
tot_cpus = 8

pg_cnt = min(int(tot_gpus / bundle1['GPU']), int(tot_cpus / bundle1['CPU']))

pgs = [
    placement_group([bundle1], strategy="STRICT_PACK") for _ in range(pg_cnt)
]
# Wait until placement group is created.
ray.get([pg.ready() for pg in pgs])
for pg in pgs:
    print(placement_group_table(pg))

# You can look at placement group states using this API.
print(placement_group_table(pg))

actors = [
    AsyncActor.options(placement_group=pg).remote(i)
    for i, pg in enumerate(pgs)
]

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Two "CPU"s are available.
ray.init(num_cpus=2)

# Create a placement group.
pg = placement_group([{"CPU": 2}])
ray.get(pg.ready())


# Now, 2 CPUs are not available anymore because
# they are pre-reserved by the placement group.
@ray.remote(num_cpus=2)
def f():
    return True


# Won't be scheduled because there are no longer 2 free CPUs.
f.remote()

# Will be scheduled because 2 CPUs are reserved by the placement group.
f.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
    placement_group=pg)).remote()
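
A short follow-up sketch, not part of the original snippet: removing the placement group returns the two reserved CPUs to the cluster, so the plain 2-CPU task submitted above can be scheduled.

from ray.util.placement_group import remove_placement_group

# While the group exists, the 2 CPUs are accounted to the group's bundle
# resources rather than to the ordinary "CPU" pool.
print(ray.available_resources())

# Releasing the group frees those CPUs, so the first f.remote() call above
# can now be scheduled.
remove_placement_group(pg)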
Example #25
    time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())


def hey(_):
    time.sleep(0.01)  # Sleep for 10ms
    return b"hey"


num_connections = int(num_remote_cpus * 0.75)
num_threads = 2
time_to_run = "10s"

pg = placement_group(
    [{
        "CPU": 1
    } for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
ray.get(pg.ready())

# The number of replicas is the number of cores remaining after accounting
# for the one HTTP proxy actor on each node, the "hey" requester task on each
# node, and the serve controller.
# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
num_replicas = ray.available_resources()["CPU"]
logger.info("Starting %i replicas", num_replicas)
client.create_backend(
    "hey", hey, config=BackendConfig(num_replicas=num_replicas))
client.create_endpoint("hey", backend="hey", route="/hey")


# In[ ]:


@ray.remote
def remote_fun(x):
    return x


# In[ ]:

#tag::placement_group[]
# Create a placement group.
cpu_bundle = {"CPU": 3}
mini_cpu_bundle = {"CPU": 1}
pg = placement_group([cpu_bundle, mini_cpu_bundle])
ray.get(pg.ready())
print(placement_group_table(pg))
print(ray.available_resources())
# Run remote_fun in cpu_bundle
handle = remote_fun.options(placement_group=pg,
                            placement_group_bundle_index=0).remote(1)
#end::placement_group[]

# In[ ]:


#tag::runtime_env_local[]
@ray.remote(runtime_env=runtime_env)
def sup(x):
    from bs4 import BeautifulSoup
def run_trial(total_stage, num_pg_per_stage):
    creating_e2e_s = []
    removing_e2e_s = []
    # Create and remove placement groups.
    for i in range(total_stage):
        # Create pgs.
        pgs = []
        start = perf_counter()
        for _ in range(num_pg_per_stage):
            pgs.append(
                placement_group(bundles=[{
                    "custom": 0.025
                } for _ in range(4)],
                                strategy="PACK"))
        logger.info(f"Created {num_pg_per_stage} pgs.")
        ray.get([pg.ready() for pg in pgs])
        end = perf_counter()
        total_creating_time = end - start
        logger.info(f"Creating {num_pg_per_stage} took "
                    f"{total_creating_time} seconds at stage {i}")
        creating_e2e_s.append(total_creating_time * 1000.0)

        # Remove pgs
        start = perf_counter()
        for _, pg in enumerate(pgs):
            remove_placement_group(pg)
        end = perf_counter()
        total_removal_time = end - start
        logger.info(f"removed {num_pg_per_stage} pgs took "
                    f"{total_removal_time} seconds at stage {i}")
        removing_e2e_s.append(total_removal_time * 1000.0)
        # time.sleep(1)

    # Calculate the scheduling latency (excluding queueing time).
    latencies = []
    for entry in ray.util.placement_group_table().values():
        latency = entry["stats"]["scheduling_latency_ms"]
        latencies.append(latency)
    latencies = sorted(latencies)
    removing_e2e_s = sorted(removing_e2e_s)
    creating_e2e_s = sorted(creating_e2e_s)

    def get_scheduling_perf(latencies):
        """Return P10, 50, 95, 99 latency"""
        p10 = latencies[int(len(latencies) * 0.1)]
        p50 = latencies[int(len(latencies) * 0.5)]
        p95 = latencies[int(len(latencies) * 0.95)]
        p99 = latencies[int(len(latencies) * 0.99)]
        return {"p10_ms": p10, "p50_ms": p50, "p95_ms": p95, "p99_ms": p99}

    scheduling_perf = get_scheduling_perf(latencies)
    removing_perf = get_scheduling_perf(removing_e2e_s)
    creation_perf = get_scheduling_perf(creating_e2e_s)

    wait_for_condition(
        lambda: (ray.cluster_resources()["custom"] == ray.available_resources(
        )["custom"]),
        timeout=30,
    )
    wait_for_condition(
        lambda: (ray.cluster_resources()["pending"] == ray.available_resources(
        )["pending"]),
        timeout=30,
    )

    return scheduling_perf, removing_perf, creation_perf
Example #28
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    request_resources(num_cpus=42)

    # add placement groups.
    pg_demands = [{"GPU": 2}, {"extra_resource": 2}]
    strategy = "STRICT_PACK"
    pg = placement_group(pg_demands, strategy=strategy)
    pg.ready()
    time.sleep(2)  # Wait for placement groups to propagate.

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    visited_atleast_once = [set(), set()]
    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        pg_response_data = monitor.load_metrics.pending_placement_groups
        assert_correct_pg(pg_response_data, pg_demands, strategy)

        if "memory" in resource_usage[0]:
            del resource_usage[0]["memory"]
            visited_atleast_once[0].add("memory")
        if "object_store_memory" in resource_usage[0]:
            del resource_usage[0]["object_store_memory"]
            visited_atleast_once[0].add("object_store_memory")
        if "memory" in resource_usage[1]:
            del resource_usage[1]["memory"]
            visited_atleast_once[1].add("memory")
        if "object_store_memory" in resource_usage[1]:
            del resource_usage[1]["object_store_memory"]
            visited_atleast_once[1].add("object_store_memory")
        for key in list(resource_usage[0].keys()):
            if key.startswith("node:"):
                del resource_usage[0][key]
                visited_atleast_once[0].add("node:")
        for key in list(resource_usage[1].keys()):
            if key.startswith("node:"):
                del resource_usage[1][key]
                visited_atleast_once[1].add("node:")
        if expected_resource_usage is None:
            if all(x for x in resource_usage[0:]):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)

        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    assert visited_atleast_once[0] == {
        "memory", "object_store_memory", "node:"
    }
    assert visited_atleast_once[0] == visited_atleast_once[1]

    remove_placement_group(pg)

    return resource_usage
Example #29
                    ray.cluster_resources())

        # Scenario 1: Create bunch of placement groups and measure how long
        # it takes.
        total_creating_time = 0
        total_removing_time = 0
        repeat = 1
        total_trial = repeat * NUM_PG
        BUNDLES = [{"pg_custom": 1}] * NUM_NODES

        # Create and remove placement groups.
        for _ in range(repeat):
            pgs = []
            for i in range(NUM_PG):
                start = perf_counter()
                pgs.append(placement_group(BUNDLES, strategy="PACK"))
                end = perf_counter()
                logger.info(f"append_group iteration {i}")
                total_creating_time += (end - start)

            ray.get([pg.ready() for pg in pgs])

            for i, pg in enumerate(pgs):
                start = perf_counter()
                remove_placement_group(pg)
                end = perf_counter()
                logger.info(f"remove_group iteration {i}")
                total_removing_time += (end - start)

        # Validate the correctness.
        assert ray.cluster_resources()[
Example #30
import time
import ray
from ray.util.placement_group import (placement_group, placement_group_table,
                                      remove_placement_group)

if __name__ == "__main__":
    ray.init(num_cpus=2, resources={"extra_resources": 2})
    bundle_1 = {"CPU": 2}
    bundle_2 = {"extra_resources": 2}

    pg = placement_group([bundle_1, bundle_2], strategy="STRICT_PACK")

    # You can also use ray.wait.
    ready, unready = ray.wait([pg.ready()], timeout=5)
    print(f"placement group status:{ready}")
    print(placement_group_table(pg))

    time.sleep(10)
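
    # A possible continuation (illustrative, not part of the original example):
    # use the remove_placement_group imported above to release the reserved
    # bundles once they are no longer needed.
    if not ready:
        print("placement group was not ready within 5 seconds")
    remove_placement_group(pg)
    print(ray.available_resources())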