# run evaluations and get the results
evaluations = [
    Evaluation.remote(update_id) for _ in range(ASYNC_EVALUATIONS)
]
results = ray.get(
    [evaluation.run.remote() for evaluation in evaluations]
)

# tally the results
apprentice_wins = sum(results)
print(f'the apprentice won {apprentice_wins} games...')
update = (apprentice_wins / NUM_EVAL_PLAYS) > WIN_RATIO

# update if necessary
if update:
    # increment and get the current update id
    update_id = ray.get(update_signal.set_update_id.remote())
    # block until new alpha parameters are saved, indexed by the current
    # update_id
    ray.get(
        evaluations[0].update_alpha_parameters.remote(update_id)
    )
    # send update signal to self-play actors
    update_signal.send_update.remote()

# free up resources by killing the evaluation actors
for evaluation in evaluations:
    ray.kill(evaluation)

# manual garbage collection
gc.collect()
def _kill_http_proxy():
    [http_proxy] = ray.get(
        serve.api._get_master_actor().get_http_proxy.remote())
    ray.kill(http_proxy)
def run(self):
    while True:
        ray.kill(
            random.choice(self._get_all_serve_actors()), no_restart=False)
        time.sleep(self.kill_period_s)
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        import time
        time.sleep(1)
        return True

    # Schedule tasks to fail initial placement group creation.
    tasks = [bothering_task.remote() for _ in range(2)]

    # Create an actor that will fail bundle scheduling.
    # It is important to use pack strategy to make test less flaky.
    pg = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{
            "CPU": bundle_cpu_size
        } for _ in range(num_nodes * bundle_per_node)])

    # Create a placement group actor.
    # This shouldn't be scheduled because atomic
    # placement group creation should've failed.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()

    # Wait on the placement group now. It should be unready
    # because the normal actor takes resources that are required
    # for one of the bundle creations.
    ready, unready = ray.wait([pg.ready()], timeout=0)
    assert len(ready) == 0
    assert len(unready) == 1
    # Wait until all tasks are done.
    assert all(ray.get(tasks))

    # Wait on the placement group creation. Since resources are now
    # available, it should be ready soon.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. It will
    # raise an exception if the actor was scheduled before the placement
    # group was created; this checks atomicity.
    ray.get(pg_actor.ping.remote(), timeout=3.0)
    ray.kill(pg_actor)

    # Make sure atomic creation failure didn't impact resources.
    @ray.remote(num_cpus=bundle_cpu_size)
    def resource_check():
        return True

    # These should hang because all resources
    # are claimed by the placement group.
    check_without_pg = [
        resource_check.remote() for _ in range(bundle_per_node * num_nodes)
    ]
    # These should all be scheduled, one on each bundle.
    check_with_pg = [
        resource_check.options(
            placement_group=pg, placement_group_bundle_index=i).remote()
        for i in range(bundle_per_node * num_nodes)
    ]

    # Make sure these are hanging.
    ready, unready = ray.wait(check_without_pg, timeout=0)
    assert len(ready) == 0
    assert len(unready) == bundle_per_node * num_nodes

    # Make sure these are all scheduled.
    assert all(ray.get(check_with_pg))

    ray.util.remove_placement_group(pg)

    def pg_removed():
        return ray.util.placement_group_table(pg)["state"] == "REMOVED"

    wait_for_condition(pg_removed)

    # Make sure the checks without placement groups are all
    # scheduled properly because resources are cleaned up.
    assert all(ray.get(check_without_pg))
def _kill_http_proxies(client):
    http_proxies = ray.get(client._controller.get_http_proxies.remote())
    for http_proxy in http_proxies.values():
        ray.kill(http_proxy, no_restart=False)
def test_object_unpin(ray_start_cluster):
    nodes = []
    cluster = ray_start_cluster
    head_node = cluster.add_node(
        num_cpus=0,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "num_heartbeats_timeout": 10,
            "subscriber_timeout_ms": 100
        })
    ray.init(address=cluster.address)

    # Add worker nodes.
    for i in range(2):
        nodes.append(
            cluster.add_node(
                num_cpus=1,
                resources={f"node_{i}": 1},
                object_store_memory=100 * 1024 * 1024))
    cluster.wait_for_nodes()

    one_mb_array = np.ones(1 * 1024 * 1024, dtype=np.uint8)
    ten_mb_array = np.ones(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote
    class ObjectsHolder:
        def __init__(self):
            self.ten_mb_objs = []
            self.one_mb_objs = []

        def put_10_mb(self):
            self.ten_mb_objs.append(ray.put(ten_mb_array))

        def put_1_mb(self):
            self.one_mb_objs.append(ray.put(one_mb_array))

        def pop_10_mb(self):
            if len(self.ten_mb_objs) == 0:
                return False
            self.ten_mb_objs.pop()
            return True

        def pop_1_mb(self):
            if len(self.one_mb_objs) == 0:
                return False
            self.one_mb_objs.pop()
            return True

    # Head node contains 11MB of data.
    one_mb_arrays = []
    ten_mb_arrays = []

    one_mb_arrays.append(ray.put(one_mb_array))
    ten_mb_arrays.append(ray.put(ten_mb_array))

    def check_memory(mb):
        return ((f"Plasma memory usage {mb} "
                 "MiB" in memory_summary(
                     address=head_node.address, stats_only=True)))

    def wait_until_node_dead(node):
        for n in ray.nodes():
            if (n["ObjectStoreSocketName"] == node.address_info[
                    "object_store_address"]):
                return not n["Alive"]
        return False

    wait_for_condition(lambda: check_memory(11))

    # Pop the 1 MB array and see if it works.
    one_mb_arrays.pop()
    wait_for_condition(lambda: check_memory(10))

    # Pop 10 MB.
    ten_mb_arrays.pop()
    wait_for_condition(lambda: check_memory(0))

    # Put 11 MB for each actor.
    # actor 1: 1MB + 10MB
    # actor 2: 1MB + 10MB
    actor_on_node_1 = ObjectsHolder.options(resources={"node_0": 1}).remote()
    actor_on_node_2 = ObjectsHolder.options(resources={"node_1": 1}).remote()
    ray.get(actor_on_node_1.put_1_mb.remote())
    ray.get(actor_on_node_1.put_10_mb.remote())
    ray.get(actor_on_node_2.put_1_mb.remote())
    ray.get(actor_on_node_2.put_10_mb.remote())
    wait_for_condition(lambda: check_memory(22))

    # actor 1: 10MB
    # actor 2: 1MB
    ray.get(actor_on_node_1.pop_1_mb.remote())
    ray.get(actor_on_node_2.pop_10_mb.remote())
    wait_for_condition(lambda: check_memory(11))

    # The second node is dead, and actor 2 is dead.
    cluster.remove_node(nodes[1], allow_graceful=False)
    wait_for_condition(lambda: wait_until_node_dead(nodes[1]))
    wait_for_condition(lambda: check_memory(10))

    # The first actor is dead, so its objects should be GC'ed.
    ray.kill(actor_on_node_1)
    wait_for_condition(lambda: check_memory(0))
async def router(serve_instance):
    q = ray.remote(Router).remote(serve_instance._controller)
    yield q
    ray.kill(q)
def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard):
    timeout = 5
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(
        host=address[0],
        port=int(address[1]),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    p = client.pubsub(ignore_subscribe_messages=True)
    p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN)

    @ray.remote
    class DummyActor:
        def __init__(self):
            pass

    # Create a dummy actor.
    a = DummyActor.remote()

    def handle_pub_messages(client, msgs, timeout, expect_num):
        start_time = time.time()
        while time.time() - start_time < timeout and len(msgs) < expect_num:
            msg = client.get_message()
            if msg is None:
                time.sleep(0.01)
                continue
            pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"])
            actor_data = ray.gcs_utils.ActorTableData.FromString(
                pubsub_msg.data)
            msgs.append(actor_data)

    msgs = []
    handle_pub_messages(p, msgs, timeout, 2)

    # Assert we received published actor messages with state
    # DEPENDENCIES_UNREADY and ALIVE.
    assert len(msgs) == 2

    # Kill the actor.
    ray.kill(a)
    handle_pub_messages(p, msgs, timeout, 3)

    # Assert we received a published actor message with state DEAD.
    assert len(msgs) == 3

    def actor_table_data_to_dict(message):
        return dashboard_utils.message_to_dict(
            message, {
                "actorId", "parentId", "jobId", "workerId", "rayletId",
                "actorCreationDummyObjectId", "callerId", "taskId",
                "parentTaskId", "sourceActorId", "placementGroupId"
            },
            including_default_value_fields=False)

    non_state_keys = ("actorId", "jobId", "taskSpec")
    for msg in msgs:
        actor_data_dict = actor_table_data_to_dict(msg)
        # DEPENDENCIES_UNREADY is 0, which would not be kept in the dict.
        # We need to check its original value.
        if msg.state == 0:
            assert len(actor_data_dict) > 5
            for k in non_state_keys:
                assert k in actor_data_dict
        # For states other than DEPENDENCIES_UNREADY, only the state
        # fields will be published.
        elif actor_data_dict["state"] in ("ALIVE", "DEAD"):
            assert actor_data_dict.keys() == {
                "state", "address", "timestamp", "pid",
                "creationTaskException"
            }
        else:
            raise Exception("Unknown state: {}".format(
                actor_data_dict["state"]))
help="start server without blocking") args = parser.parse_args() os.makedirs(args.save_path, exist_ok=True) assert os.path.isdir(args.seesaw_root) ray.init("auto", namespace="seesaw", log_to_driver=True) seesaw_root = os.path.abspath(os.path.expanduser(args.seesaw_root)) save_path = os.path.abspath(os.path.expanduser(args.save_path)) actor_name = "session_manager" try: oldh = ray.get_actor(actor_name) print( "found old session_manager actor, destroying it (old sessions will be lost)" ) ray.kill(oldh) except: pass session_manager = SessionManagerActor.options(name=actor_name).remote( root_dir=seesaw_root, save_path=save_path, num_cpus_per_session=args.num_cpus) ray.get(session_manager.ready.remote()) uvicorn.run(app, host="127.0.0.1", port=8000, log_level="info")
def reset(self):
    for worker in self.remote_workers:
        logger.debug(f"Killing worker {worker}.")
        ray.kill(worker)
    self.remote_workers = []
def test_task_level_gc(ray_start_cluster, option):
    """Tests that task-level working_dir is GC'd when the worker exits."""

    cluster = ray_start_cluster

    soft_limit_zero = False
    worker_register_timeout = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if "num_workers_soft_limit" in system_config and \
            system_config["num_workers_soft_limit"] == 0:
        soft_limit_zero = True
    if "worker_register_timeout_seconds" in system_config and \
            system_config["worker_register_timeout_seconds"] != 0:
        worker_register_timeout = True

    @ray.remote
    def f():
        import test_module
        test_module.one()

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module
            test_module.one()

    if option == "working_dir":
        runtime_env = {"working_dir": S3_PACKAGE_URI}
    else:
        runtime_env = {"py_modules": [S3_PACKAGE_URI]}

    # Note: We should set a bigger timeout if the S3 package downloads slowly.
    get_timeout = 10

    # Start a task with the runtime env.
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(),
                timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is
        # high enough.
        assert not check_local_files_gced(cluster)

    # Start an actor with the runtime env.
    actor = A.options(runtime_env=runtime_env).remote()
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(actor.check.remote(), timeout=get_timeout)
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        ray.get(actor.check.remote())
        assert not check_local_files_gced(cluster)

    # Kill the actor.
    ray.kill(actor)
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is
        # high enough.
        assert not check_local_files_gced(cluster)

    # Start a task with the runtime env.
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(),
                timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is
        # high enough.
        assert not check_local_files_gced(cluster)
def end_session(self, session_id):
    # The session should die once its last reference is dropped.
    sess = self.sessions[session_id]
    del self.sessions[session_id]
    print(f"ending session {session_id}")
    ray.kill(sess)
def shutdown(self) -> None:
    for proxy in self.get_http_proxy_handles().values():
        ray.kill(proxy, no_restart=True)
def shutdown(self):
    for handle in self.actor_handles:
        ray.kill(handle)
    self.actor_handles.clear()
def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
    # Limit our object store to 75 MiB of memory.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    cluster = ray_start_cluster
    # Head node.
    cluster.add_node(
        num_cpus=1,
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 2,
            "min_spilling_size": 20 * 1024 * 1024,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
                "params": {
                    "directory_path": str(temp_folder)
                }
            }),
        })
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    ray.init(address=cluster.address)

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                self.replay_buffer.append(ref)

                # Pop from the replay buffer with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=0)
                assert np.array_equal(sample, arr)

    actors = [Actor.remote() for _ in range(3)]
    ray.get([actor.create_objects.remote() for actor in actors])

    def wait_until_actor_dead(actor):
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    def is_dir_empty():
        num_files = 0
        for path in temp_folder.iterdir():
            num_files += 1
        return num_files == 0

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))
    # The multi-node deletion should work.
    wait_for_condition(is_dir_empty)
def _kill_router():
    [router] = ray.get(serve.api._get_master_actor().get_router.remote())
    ray.kill(router, no_restart=False)
def test_multiple_routers(ray_cluster):
    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME,
                                  serve.api._global_client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers; the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't
        # updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
def handle_result(self, results):
    print(results)
    if self.counter == self.fail_on:
        ray.kill(self.worker_group.workers[0].actor)
        time.sleep(3)
    self.counter += 1
def logging_loop(self):
    """
    Keep track of the training performance.
    """
    # Launch the test worker to get performance metrics
    test_worker = self_play.SelfPlay.options(
        num_gpus=self.config.selfplay_num_gpus
        if "cuda" in self.config.selfplay_device
        else 0
    ).remote(
        copy.deepcopy(self.muzero_weights),
        self.Game,
        self.config,
        self.config.seed + self.config.num_workers,
    )
    test_worker.continuous_self_play.remote(self.shared_storage_worker, None, True)

    # Write everything in TensorBoard
    writer = SummaryWriter(self.config.results_path)

    print(
        "\nTraining...\nRun tensorboard --logdir ./results and go to "
        "http://localhost:6006/ to see in real time the training performance.\n"
    )

    # Save hyperparameters to TensorBoard
    hp_table = [
        f"| {key} | {value} |" for key, value in self.config.__dict__.items()
    ]
    writer.add_text(
        "Hyperparameters",
        "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
    )
    # Save model representation
    writer.add_text(
        "Model summary",
        self.summary,
    )
    # Loop for updating the training performance
    counter = 0
    info = ray.get(self.shared_storage_worker.get_info.remote())
    try:
        while info["training_step"] < self.config.training_steps:
            info = ray.get(self.shared_storage_worker.get_info.remote())
            writer.add_scalar(
                "1.Total reward/1.Total reward", info["total_reward"], counter,
            )
            writer.add_scalar(
                "1.Total reward/2.Mean value", info["mean_value"], counter,
            )
            writer.add_scalar(
                "1.Total reward/3.Episode length", info["episode_length"], counter,
            )
            writer.add_scalar(
                "1.Total reward/4.MuZero reward", info["muzero_reward"], counter,
            )
            writer.add_scalar(
                "1.Total reward/5.Opponent reward", info["opponent_reward"], counter,
            )
            writer.add_scalar(
                "2.Workers/1.Self played games", info["num_played_games"], counter,
            )
            writer.add_scalar(
                "2.Workers/2.Training steps", info["training_step"], counter
            )
            writer.add_scalar(
                "2.Workers/3.Self played steps", info["num_played_steps"], counter
            )
            writer.add_scalar(
                "2.Workers/4.Reanalysed games", info["num_reanalysed_games"], counter,
            )
            writer.add_scalar(
                "2.Workers/5.Training steps per self played step ratio",
                info["training_step"] / max(1, info["num_played_steps"]),
                counter,
            )
            writer.add_scalar("2.Workers/6.Learning rate", info["lr"], counter)
            writer.add_scalar(
                "3.Loss/1.Total weighted loss", info["total_loss"], counter
            )
            writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
            writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
            writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
            print(
                f'Last test reward: {info["total_reward"]:.2f}. '
                f'Training step: {info["training_step"]}/{self.config.training_steps}. '
                f'Played games: {info["num_played_games"]}. '
                f'Loss: {info["total_loss"]:.2f}',
                end="\r",
            )
            counter += 1
            time.sleep(0.5)
    except KeyboardInterrupt:
        ray.kill(test_worker)
        ray.kill(self.training_worker)
        for worker in self.self_play_workers:
            ray.kill(worker)

    self.muzero_weights = ray.get(self.shared_storage_worker.get_weights.remote())
    self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote())
    if self.config.save_weights:
        # Persist the replay buffer to disk
        print("\n\nPersisting replay buffer games to disk...")
        pickle.dump(
            self.replay_buffer,
            open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
        )
def test_recover_start_from_replica_actor_names(serve_instance):
    """Test controller is able to recover starting -> running replicas from
    actor names.
    """
    # Test a deploy with a total of 2 replicas,
    # where the first constructor call fails.
    @serve.deployment(
        name="recover_start_from_replica_actor_names", num_replicas=2)
    class TransientConstructorFailureDeployment:
        def __init__(self):
            return True

        def __call__(self, *args):
            return "hii"

    TransientConstructorFailureDeployment.deploy()
    for _ in range(10):
        response = request_with_retries(
            "/recover_start_from_replica_actor_names/", timeout=30)
        assert response.text == "hii"
    # Assert 2 replicas are running in the deployment after a partially
    # successful deploy() call with a transient error.
    deployment_dict = ray.get(
        serve_instance._controller._all_running_replicas.remote())
    assert len(deployment_dict["recover_start_from_replica_actor_names"]) == 2

    replica_version_hash = None
    for replica in deployment_dict["recover_start_from_replica_actor_names"]:
        ref = replica.actor_handle.get_metadata.remote()
        _, version = ray.get(ref)
        if replica_version_hash is None:
            replica_version_hash = hash(version)
        assert replica_version_hash == hash(version), (
            "Replica version hash should be the same for "
            "same code version and user config.")

    # Sample: [
    # 'TransientConstructorFailureDeployment#xlituP',
    # 'SERVE_CONTROLLER_ACTOR',
    # 'TransientConstructorFailureDeployment#NosHNA',
    # 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-node:192.168.86.165-0']
    all_actor_names = ray.util.list_named_actors()
    all_replica_names = [
        actor_name for actor_name in all_actor_names
        if (SERVE_CONTROLLER_NAME not in actor_name
            and SERVE_PROXY_NAME not in actor_name)
    ]
    assert (len(all_replica_names) == 2
            ), "Should have two running replicas fetched from ray API."

    # Kill the controller and wait for the endpoint to be available again.
    ray.kill(serve.context._global_client._controller, no_restart=False)
    for _ in range(10):
        response = request_with_retries(
            "/recover_start_from_replica_actor_names/", timeout=30)
        assert response.text == "hii"

    # Ensure recovered replica names are the same.
    recovered_all_actor_names = ray.util.list_named_actors()
    recovered_all_replica_names = [
        actor_name for actor_name in recovered_all_actor_names
        if (SERVE_CONTROLLER_NAME not in actor_name
            and SERVE_PROXY_NAME not in actor_name)
    ]
    assert (recovered_all_replica_names == all_replica_names
            ), "Running replica actor names after recovery must match"

    # Ensure recovered replica version hashes are the same.
    for replica_name in recovered_all_replica_names:
        actor_handle = ray.get_actor(replica_name)
        ref = actor_handle.get_metadata.remote()
        _, version = ray.get(ref)
        assert replica_version_hash == hash(version), (
            "Replica version hash should be the same after "
            "recovering from actor names")
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by detached
    # actors are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(3)],
        strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by detached actor
# is not cleaned with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
"""

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    assert assert_num_cpus(num_nodes)

    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): Children of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that was created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
def test_recover_rolling_update_from_replica_actor_names(serve_instance):
    """Test controller is able to recover starting -> updating -> running
    replicas from actor names, with right replica versions during rolling
    update.
    """
    client = serve_instance

    name = "test"

    @ray.remote(num_cpus=0)
    def call(block=False):
        handle = serve.get_deployment(name).get_handle()
        ret = ray.get(handle.handler.remote(block))

        return ret.split("|")[0], ret.split("|")[1]

    signal_name = f"signal#{get_random_letters()}"
    signal = SignalActor.options(name=signal_name).remote()

    @serve.deployment(name=name, version="1", num_replicas=2)
    class V1:
        async def handler(self, block: bool):
            if block:
                signal = ray.get_actor(signal_name)
                await signal.wait.remote()

            return f"1|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler(request.query_params["block"] == "True")

    class V2:
        async def handler(self, *args):
            return f"2|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler()

    def make_nonblocking_calls(expected, expect_blocking=False,
                               num_returns=1):
        # Returns dict[val, set(pid)].
        blocking = []
        responses = defaultdict(set)
        start = time.time()
        timeout_value = 60 if sys.platform == "win32" else 30
        while time.time() - start < timeout_value:
            refs = [call.remote(block=False) for _ in range(10)]
            ready, not_ready = ray.wait(
                refs, timeout=5, num_returns=num_returns)
            for ref in ready:
                val, pid = ray.get(ref)
                responses[val].add(pid)
            for ref in not_ready:
                blocking.extend(not_ready)

            if all(
                    len(responses[val]) >= num
                    for val, num in expected.items()) and (
                        expect_blocking is False or len(blocking) > 0):
                break
        else:
            assert False, f"Timed out, responses: {responses}."

        return responses, blocking

    V1.deploy()
    responses1, _ = make_nonblocking_calls({"1": 2}, num_returns=2)
    pids1 = responses1["1"]

    # ref2 will block a single replica until the signal is sent. Check that
    # some requests are now blocking.
    ref2 = call.remote(block=True)
    responses2, blocking2 = make_nonblocking_calls(
        {"1": 1}, expect_blocking=True)
    assert list(responses2["1"])[0] in pids1

    ray.kill(serve.context._global_client._controller, no_restart=False)

    # Redeploy the new version. Since there is one replica blocking, only one
    # new replica should be started up.
    V2 = V1.options(func_or_class=V2, version="2")
    V2.deploy(_blocking=False)
    with pytest.raises(TimeoutError):
        client._wait_for_deployment_healthy(V2.name, timeout_s=0.1)
    responses3, blocking3 = make_nonblocking_calls(
        {"1": 1}, expect_blocking=True)

    ray.kill(serve.context._global_client._controller, no_restart=False)

    # Signal the original call to exit.
    ray.get(signal.send.remote())
    val, pid = ray.get(ref2)
    assert val == "1"
    assert pid in responses1["1"]

    # Now the goal and requests to the new version should complete.
    # We should have two running replicas of the new version.
    client._wait_for_deployment_healthy(V2.name)
    make_nonblocking_calls({"2": 2}, num_returns=2)
def test_capture_child_actors(ray_start_cluster):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    pg = ray.util.placement_group(
        [{
            "CPU": 2
        }, {
            "CPU": 2
        }], strategy="STRICT_PACK")
    ray.get(pg.ready())

    # If get_current_placement_group is used when the current worker/driver
    # doesn't belong to any placement group, it should return None.
    assert get_current_placement_group() is None

    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor(self):
            # Make sure we can capture the current placement group.
            assert get_current_placement_group() is not None
            # Actors should be implicitly captured.
            actor = NestedActor.remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

        def schedule_nested_actor_outside_pg(self):
            # Don't use the placement group.
            actor = NestedActor.options(placement_group=None).remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure all the actors are scheduled on the same node,
    # because the placement group uses the STRICT_PACK strategy.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    # Since all node ids should be identical, the set size should be 1.
    assert len(node_id_set) == 1

    # Kill the actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Now create an actor, but do not capture the current tasks.
    a = Actor.options(
        placement_group=pg,
        placement_group_capture_child_tasks=False).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because the child actors are not scheduled in the same
    # placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2

    # Kill the actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Lastly, make sure that when placement_group=None is specified, the
    # nested actors are not scheduled on the same placement group.
    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor_outside_pg.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because the child actors are not scheduled in the same
    # placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2
def run(self,
        worker_fn: Callable,
        callbacks: Optional[List[Callable]] = None) -> List[Any]:
    """Executes the provided function on all workers.

    Args:
        worker_fn: Target elastic function that can be executed.
        callbacks: List of callables. Each callback must either be
            a callable function or a class that implements __call__.
            Every callback will be invoked on every value logged
            by the rank 0 worker.

    Returns:
        List of return values from every completed worker.
    """
    return_values = []
    from ray.util.queue import Queue
    import inspect
    args = inspect.getfullargspec(Queue).args
    if "actor_options" not in args:
        # Ray 1.1 and less
        _queue = Queue()
    else:
        _queue = Queue(actor_options={
            "num_cpus": 0,
            "resources": {
                ray.state.current_node_id(): 0.001
            }
        })
    self.driver.start(
        self.settings.num_proc,
        self._create_spawn_worker_fn(return_values, worker_fn, _queue))

    def _process_calls(queue, callbacks, event):
        if not callbacks:
            return
        while queue.actor:
            if not queue.empty():
                result = queue.get_nowait()
                for c in callbacks:
                    c(result)
            # avoid slamming the CI
            elif event.is_set():
                break
            time.sleep(0.1)

    try:
        event = threading.Event()
        _callback_thread = threading.Thread(
            target=_process_calls,
            args=(_queue, callbacks, event),
            daemon=True)
        _callback_thread.start()
        res = self.driver.get_results()
        event.set()
        if _callback_thread:
            _callback_thread.join(timeout=60)
    finally:
        if hasattr(_queue, "shutdown"):
            _queue.shutdown()
        else:
            done_ref = _queue.actor.__ray_terminate__.remote()
            done, not_done = ray.wait([done_ref], timeout=5)
            if not_done:
                ray.kill(_queue.actor)
    self.driver.stop()

    if res.error_message is not None:
        raise RuntimeError(res.error_message)

    for name, value in sorted(
            res.worker_results.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes '
                'exited with non-zero '
                'status, thus causing the job to be terminated. '
                'The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))

    return_values = [
        value for k, value in sorted(return_values, key=lambda kv: kv[0])
    ]
    return return_values
def test_worker_replica_failure(serve_instance):
    serve.http_proxy.MAX_ACTOR_DEAD_RETRIES = 0
    serve.init()
    serve.create_endpoint(
        "replica_failure", "/replica_failure", methods=["GET"])

    class Worker:
        # Assumes that two replicas are started. Will hang forever in the
        # constructor for any workers that are restarted.
        def __init__(self, path):
            self.should_hang = False
            if not os.path.exists(path):
                with open(path, "w") as f:
                    f.write("1")
            else:
                with open(path, "r") as f:
                    num = int(f.read())

                with open(path, "w") as f:
                    if num == 2:
                        self.should_hang = True
                    else:
                        f.write(str(num + 1))

            if self.should_hang:
                while True:
                    pass

        def __call__(self):
            pass

    temp_path = tempfile.gettempdir() + "/" + serve.utils.get_random_letters()
    serve.create_backend(Worker, "replica_failure", temp_path)
    backend_config = serve.get_backend_config("replica_failure")
    backend_config.num_replicas = 2
    serve.set_backend_config("replica_failure", backend_config)
    serve.link("replica_failure", "replica_failure")

    # Wait until both replicas have been started.
    responses = set()
    while len(responses) < 2:
        responses.add(
            request_with_retries("/replica_failure", timeout=0.1).text)
        time.sleep(0.1)

    # Kill one of the replicas.
    handles = _get_worker_handles("replica_failure")
    assert len(handles) == 2
    ray.kill(handles[0])

    # Check that the other replica still serves requests.
    for _ in range(10):
        while True:
            try:
                # The timeout needs to be small here because the request to
                # the restarting worker will hang.
                request_with_retries("/replica_failure", timeout=0.1)
                break
            except TimeoutError:
                time.sleep(0.1)
def _kill_routers():
    routers = ray.get(serve.api._get_controller().get_routers.remote())
    for router in routers.values():
        ray.kill(router, no_restart=False)
def _kill_router():
    [router] = ray.get(serve.api._get_master_actor().get_router.remote())
    ray.kill(router)
def test_detached_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)

    # Make sure the detached placement group stays alive after the job dies.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())

@ray.remote(num_cpus=1)
class Actor:
    def ready(self):
        return True

for bundle_index in range(2):
    actor = Actor.options(lifetime="detached", placement_group=pg,
                placement_group_bundle_index=bundle_index).remote()
    ray.get(actor.ready.remote())

ray.shutdown()
"""

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_alive_num_pg(expected_num_pg):
        alive_num_pg = 0
        for _, placement_group_info in ray.util.placement_group_table(
        ).items():
            if placement_group_info["state"] == "CREATED":
                alive_num_pg += 1
        return alive_num_pg == expected_num_pg

    def assert_alive_num_actor(expected_num_actor):
        alive_num_actor = 0
        for actor_info in ray.actors().values():
            if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
                alive_num_actor += 1
        return alive_num_actor == expected_num_actor

    wait_for_condition(is_job_done)

    assert assert_alive_num_pg(1)
    assert assert_alive_num_actor(2)

    # Make sure the detached placement group stays alive when its creator,
    # a detached actor, dies.
    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor_with_detached_pg(self):
            # Create a placement group which is detached.
            pg = ray.util.placement_group(
                [{
                    "CPU": 1
                } for _ in range(2)],
                strategy="STRICT_SPREAD",
                lifetime="detached",
                name="detached_pg")
            ray.get(pg.ready())
            # Schedule nested actors with the placement group.
            for bundle_index in range(2):
                actor = NestedActor.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index,
                    lifetime="detached").remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

    a = Actor.options(lifetime="detached").remote()
    ray.get(a.ready.remote())
    # 1 parent actor and 2 child actors.
    ray.get(a.schedule_nested_actor_with_detached_pg.remote())
    # Kill the actor and wait until it is killed.
    ray.kill(a)
    try:
        ray.get(a.ready.remote())
    except ray.exceptions.RayActorError:
        pass

    # We should have 2 alive placement groups and 4 alive actors.
    assert assert_alive_num_pg(2)
    assert assert_alive_num_actor(4)
def test_threaded_actor_creation_and_kill(ray_start_cluster):
    """Test the scenario where the threaded actors are created and killed."""
    cluster = ray_start_cluster
    NUM_CPUS_PER_NODE = 3
    NUM_NODES = 2
    for _ in range(NUM_NODES):
        cluster.add_node(num_cpus=NUM_CPUS_PER_NODE)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=0)
    class ThreadedActor:
        def __init__(self):
            self.received = []
            self.lock = threading.Lock()

        def add(self, seqno):
            time.sleep(1)
            with self.lock:
                self.received.append(seqno)

        def get_all(self):
            with self.lock:
                return self.received

        def ready(self):
            pass

        def terminate(self):
            ray.actor.exit_actor()

    # - Create threaded actors.
    # - Submit many tasks.
    # - Ungracefully kill them in the middle.
    for _ in range(10):
        actors = [
            ThreadedActor.options(max_concurrency=10).remote()
            for _ in range(NUM_NODES * NUM_CPUS_PER_NODE)
        ]
        ray.get([actor.ready.remote() for actor in actors])
        for _ in range(10):
            for actor in actors:
                actor.add.remote(1)

        time.sleep(0.5)
        for actor in actors:
            ray.kill(actor)
    ensure_cpu_returned(NUM_NODES * NUM_CPUS_PER_NODE)

    # - Create threaded actors.
    # - Submit many tasks.
    # - Gracefully kill them in the middle.
    for _ in range(10):
        actors = [
            ThreadedActor.options(max_concurrency=10).remote()
            for _ in range(NUM_NODES * NUM_CPUS_PER_NODE)
        ]
        ray.get([actor.ready.remote() for actor in actors])
        for _ in range(10):
            for actor in actors:
                actor.add.remote(1)

        time.sleep(0.5)
        for actor in actors:
            actor.terminate.remote()
    ensure_cpu_returned(NUM_NODES * NUM_CPUS_PER_NODE)
def _kill_routers(client):
    routers = ray.get(client._controller.get_routers.remote())
    for router in routers.values():
        ray.kill(router, no_restart=False)