Example #1
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 5

    nodes = [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    # Create a placement group whose bundles exactly fill the cluster.
    # It is important to use the PACK strategy to make the test less flaky.
    pg = ray.util.placement_group(
        name="name",
        strategy="PACK",
        bundles=[{"CPU": bundle_cpu_size}
                 for _ in range(num_nodes * bundle_per_node)])

    # Create a placement group actor.
    # This shouldn't be scheduled until placement group creation is done.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()
    # Destroy some nodes to fail placement group creation.
    nodes_to_kill = get_other_nodes(cluster, exclude_head=True)
    for node_to_kill in nodes_to_kill:
        cluster.remove_node(node_to_kill)

    # Wait on the placement group now. It should be unready
    # because the cluster no longer has enough resources to
    # schedule all of the bundles.
    ready, unready = ray.wait([pg.ready()], timeout=0)
    assert len(ready) == 0
    assert len(unready) == 1

    # Add a node back to schedule placement group.
    for _ in range(len(nodes_to_kill)):
        nodes.append(
            cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node))
    # Wait on the placement group creation.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. It will
    # raise an exception if the actor was scheduled before the placement
    # group was created.
    # TODO(sang): This with statement should be removed after atomic creation
    # is implemented. It will be done in the next PR.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(pg_actor.ping.remote(), timeout=3.0)
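Every example on this page relies on get_other_nodes from ray.test_utils, which returns the cluster's nodes other than the one the driver is connected to, optionally excluding the head node. Below is a minimal sketch of the idea, not the actual implementation (the head attribute is an assumption; unique_id appears in Example #2).

import ray

def get_other_nodes_sketch(cluster, exclude_head=False):
    # Return every node in the cluster except the one this driver is attached
    # to; with exclude_head=True, also skip the head node.
    this_node_id = ray.worker.global_worker.node.unique_id
    return [
        node for node in cluster.list_all_nodes()
        if node.unique_id != this_node_id
        and not (exclude_head and node.head)
    ]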
Example #2
def test_ray_wait_dead_actor(ray_start_cluster):
    """Tests that methods completed by dead actors are returned as ready"""
    cluster = ray_start_cluster

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            pass

        def node_id(self):
            return ray.worker.global_worker.node.unique_id

        def ping(self):
            time.sleep(1)

    # Create some actors and wait for them to initialize.
    num_nodes = len(cluster.list_all_nodes())
    actors = [Actor.remote() for _ in range(num_nodes)]
    ray.get([actor.ping.remote() for actor in actors])

    def actor_dead():
        # Ping the actors and make sure the tasks complete.
        ping_ids = [actor.ping.remote() for actor in actors]
        unready = ping_ids[:]
        while unready:
            _, unready = ray.wait(unready, timeout=0)
            time.sleep(1)

        try:
            ray.get(ping_ids)
            return False
        except ray.exceptions.RayActorError:
            return True

    # Kill a node that is neither the driver node nor the head node.
    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
    # Repeatedly submit tasks and call ray.wait until the exception for the
    # dead actor is received.
    wait_for_condition(actor_dead)

    # Create an actor on the local node that will call ray.wait in a loop.
    head_node_resource = "HEAD_NODE"
    ray.experimental.set_resource(head_node_resource, 1)

    @ray.remote(num_cpus=0, resources={head_node_resource: 1})
    class ParentActor:
        def __init__(self):
            pass

        def wait(self):
            return actor_dead()

        def ping(self):
            return

    # Repeatedly call ray.wait through the local actor until the exception for
    # the dead actor is received.
    parent_actor = ParentActor.remote()
    wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))
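wait_for_condition, used above, is another ray.test_utils helper: it polls a predicate until it returns True or a timeout expires. A rough sketch of that behavior follows (the default timeout and retry interval are assumptions).

import time

def wait_for_condition_sketch(condition_predictor, timeout=30,
                              retry_interval_ms=100):
    # Keep calling the predicate until it returns True; raise if it never
    # does within the timeout.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")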
Example #3
def test_placement_group_reschedule_when_node_dead(ray_start_cluster):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Make sure all three nodes (head and workers) are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    placement_group = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{"CPU": 2}, {"CPU": 2}, {"CPU": 2}])
    actor_1 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=0,
                            lifetime="detached").remote()
    actor_2 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=1,
                            lifetime="detached").remote()
    actor_3 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=2,
                            lifetime="detached").remote()
    ray.get(actor_1.value.remote())
    ray.get(actor_2.value.remote())
    ray.get(actor_3.value.remote())

    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
    cluster.wait_for_nodes()

    actor_4 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=0,
                            lifetime="detached").remote()
    actor_5 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=1,
                            lifetime="detached").remote()
    actor_6 = Actor.options(placement_group=placement_group,
                            placement_group_bundle_index=2,
                            lifetime="detached").remote()
    ray.get(actor_4.value.remote())
    ray.get(actor_5.value.remote())
    ray.get(actor_6.value.remote())
    ray.shutdown()
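The node removal can also be observed through ray.nodes(), mirroring the aliveness assertion at the top of this test. A small sketch of such a check, which would have to run right after the remove_node call and before ray.shutdown():

# Sketch: once the GCS notices the failure, ray.nodes() keeps the dead
# node's entry but marks it as not alive.
dead_nodes = [n for n in ray.nodes() if not n["alive"]]
assert len(dead_nodes) == 1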
Example #4
def test_actor_creation_node_failure(ray_start_cluster):
    # TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
    cluster = ray_start_cluster

    @ray.remote
    class Child:
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def get_probability(self):
            return self.death_probability

        def ping(self):
            # Exit process with some probability.
            exit_chance = np.random.rand()
            if exit_chance < self.death_probability:
                sys.exit(-1)

    num_children = 25
    # The child actors will die about half the time.
    death_probability = 0.5

    children = [Child.remote(death_probability) for _ in range(num_children)]
    while len(cluster.list_all_nodes()) > 1:
        for j in range(2):
            # Submit some tasks on the actors. About half of the actors will
            # fail.
            children_out = [child.ping.remote() for child in children]
            # Wait a while for all the tasks to complete. This should trigger
            # reconstruction for any actor creation tasks that were forwarded
            # to nodes that then failed.
            ready, _ = ray.wait(children_out,
                                num_returns=len(children_out),
                                timeout=5 * 60.0)
            assert len(ready) == len(children_out)

            # Replace any actors that died.
            for i, out in enumerate(children_out):
                try:
                    ray.get(out)
                except ray.exceptions.RayActorError:
                    children[i] = Child.remote(death_probability)

            children_out = [
                child.get_probability.remote() for child in children
            ]
            # Wait for newly created actors to finish creation before
            # removing a node. This is needed because right now we don't
            # support reconstructing actors that died in the process of
            # being created.
            ready, _ = ray.wait(children_out,
                                num_returns=len(children_out),
                                timeout=5 * 60.0)
            assert len(ready) == len(children_out)

        # Remove a node. Any actor creation tasks that were forwarded to this
        # node must be restarted.
        cluster.remove_node(get_other_nodes(cluster, True)[-1])
Example #5
def check_components_alive(cluster, component_type, check_component_alive):
    """Check that a given component type is alive on all worker nodes."""
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        if check_component_alive:
            assert process.poll() is None
        else:
            print("waiting for " + component_type + " with PID " +
                  str(process.pid) + "to terminate")
            process.wait()
            print("done waiting for " + component_type + " with PID " +
                  str(process.pid) + "to terminate")
            assert not process.poll() is None
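A usage sketch for this helper, borrowing the process-type constant that Example #7 uses (the exact call sequence and import path are assumptions):

import ray.ray_constants as ray_constants

# First assert the raylets on all worker nodes are still running ...
check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, True)
# ... then, after killing them elsewhere in the test, wait for them to exit.
check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, False)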
Example #6
def _test_component_failed(cluster, component_type):
    """Kill a component on all worker nodes and check workload succeeds."""
    # Submit many tasks with many dependencies.
    @ray.remote
    def f(x):
        # Sleep to make sure that tasks actually fail mid-execution.
        time.sleep(0.01)
        return x

    @ray.remote
    def g(*xs):
        # Sleep to make sure that tasks actually fail mid-execution. We
        # only use it for direct calls because the test already takes a
        # long time to run with the raylet codepath.
        time.sleep(0.01)
        return 1

    # Kill the component on all nodes except the head node as the tasks
    # execute. Do this in a loop while submitting tasks between each
    # component failure.
    time.sleep(0.1)
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        # Submit a round of tasks with many dependencies.
        x = 1
        for _ in range(1000):
            x = f.remote(x)

        xs = [g.remote(1)]
        for _ in range(100):
            xs.append(g.remote(*xs))
            xs.append(g.remote(1))

        # Kill a component on one of the nodes.
        process.terminate()
        time.sleep(1)
        process.kill()
        process.wait()
        assert process.poll() is not None

        # Make sure that we can still get the objects after the
        # executing tasks died.
        ray.get(x)
        ray.get(xs)
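In practice a helper like this is wrapped by per-component tests (the TODO in Example #4 mentions test_raylet_failed). A sketch of such a wrapper, reusing the ray_start_cluster fixture and the constant from Example #7; the test name here is hypothetical:

def test_raylet_failed_sketch(ray_start_cluster):
    # Kill the raylet on every worker node while chained tasks are running
    # and check that the workload still completes.
    _test_component_failed(ray_start_cluster, ray_constants.PROCESS_TYPE_RAYLET)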
Example #7
def test_object_reconstruction(ray_start_cluster):
    cluster = ray_start_cluster

    # Submit tasks with dependencies in plasma.
    @ray.remote
    def large_value():
        # Sleep for a bit to force tasks onto different nodes.
        time.sleep(0.1)
        return np.zeros(10 * 1024 * 1024)

    @ray.remote
    def g(x):
        return

    # Kill the component on all nodes except the head node as the tasks
    # execute. Do this in a loop while submitting tasks between each
    # component failure.
    time.sleep(0.1)
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    component_type = ray_constants.PROCESS_TYPE_RAYLET
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        # Submit a round of tasks with many dependencies.
        num_tasks = len(worker_nodes)
        xs = [large_value.remote() for _ in range(num_tasks)]
        # Wait for the tasks to complete, then evict the objects from the local
        # node.
        for x in xs:
            ray.get(x)
            ray.internal.free([x], local_only=True)

        # Kill a component on one of the nodes.
        process.terminate()
        time.sleep(1)
        process.kill()
        process.wait()
        assert process.poll() is not None

        # Make sure that we can still get the objects after the
        # executing tasks died.
        print("F", xs)
        xs = [g.remote(x) for x in xs]
        print("G", xs)
        ray.get(xs)
Example #8
@ray.remote
def f(previous_id):
    # NOTE: the signature here is assumed from the call sites below, which
    # chain f on the result of the previous iteration.
    return 1


iteration = 0
previous_ids = [1 for _ in range(100)]
start_time = time.time()
previous_time = start_time
while True:
    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]

    ray.get(previous_ids)

    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]
    node_to_kill = get_other_nodes(cluster, exclude_head=True)[0]

    # Remove the first non-head node.
    cluster.remove_node(node_to_kill)
    cluster.add_node()

    new_time = time.time()
    print("Iteration {}:\n"
          "  - Iteration time: {}.\n"
          "  - Absolute time: {}.\n"
          "  - Total elapsed time: {}.".format(iteration,
                                               new_time - previous_time,
                                               new_time,
                                               new_time - start_time))
    previous_time = new_time
    iteration += 1
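The driver loop above is the tail of a long-running workload script; it assumes a multi-node cluster was already started and connected to. A minimal sketch of that setup, modeled on the fixtures in the earlier examples (the node count, CPU size, and import paths are assumptions):

import ray
from ray.cluster_utils import Cluster
from ray.test_utils import get_other_nodes

cluster = Cluster()
for _ in range(5):
    # Assumption: five identical nodes; the first one added acts as the head.
    cluster.add_node(num_cpus=2)
ray.init(address=cluster.address)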