def test_single_cpu_cancel(shutdown_only, use_force):
    """Cancellation propagates through a dependency chain on a 1-CPU cluster.

    Builds head -> mid -> tail plus one independent task, cancels the tail
    and then the head, and checks every chained task raises while the
    independent task still completes once the signal is sent.
    """
    ray.init(num_cpus=1)
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    # Three-task chain, all ultimately blocked on the signal actor.
    head = wait_for.remote([signaler.wait.remote()])
    mid = wait_for.remote([head])
    tail = wait_for.remote([mid])
    # Unrelated task that must survive the cancellations below.
    indep = wait_for.remote([signaler.wait.remote()])

    assert not ray.wait([tail], timeout=0.1)[0]
    ray.cancel(tail, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(tail)

    # Cancelling the head must also fail its downstream dependent.
    ray.cancel(head, force=use_force)
    for ref in (head, mid):
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ref)

    signaler.send.remote()
    ray.get(indep)
async def test_job_runs_with_no_resources_available(job_manager):
    """A submitted job starts even when the cluster has no free CPUs.

    Occupies every available CPU with a hanging task, submits a job, and
    verifies the job reaches the running state before any resources free
    up; the job then succeeds once the hanging task is released.
    """
    script_path = _driver_script_path("consume_one_cpu.py")

    hang_signal_actor = SignalActor.remote()

    # Claims every currently-available CPU and blocks until signalled.
    @ray.remote(num_cpus=ray.available_resources()["CPU"])
    def consume_all_cpus():
        ray.get(hang_signal_actor.wait.remote())

    # Start a hanging task that consumes all CPUs.
    hanging_ref = consume_all_cpus.remote()

    try:
        # Check that the job starts up properly even with no CPUs available.
        # The job won't exit until it has a CPU available because it waits for
        # a task.
        job_id = job_manager.submit_job(entrypoint=f"python {script_path}")
        await async_wait_for_condition(
            check_job_running, job_manager=job_manager, job_id=job_id)
        await async_wait_for_condition(
            lambda: "Hanging..." in job_manager.get_job_logs(job_id))

        # Signal the hanging task to exit and release its CPUs.
        ray.get(hang_signal_actor.send.remote())

        # Check the job succeeds now that resources are available.
        await async_wait_for_condition(
            check_job_succeeded, job_manager=job_manager, job_id=job_id)
        await async_wait_for_condition(
            lambda: "Success!" in job_manager.get_job_logs(job_id))
    finally:
        # Just in case the test fails.
        ray.cancel(hanging_ref)
def _apply( self, fn: Any, remote_args: dict, block_list: BlockList, clear_input_blocks: bool, ) -> BlockList: context = DatasetContext.get_current() # Handle empty datasets. if block_list.initial_num_blocks() == 0: return block_list blocks = block_list.get_blocks_with_metadata() map_bar = ProgressBar("Map Progress", total=len(blocks)) if context.block_splitting_enabled: map_block = cached_remote_fn(_map_block_split).options(**remote_args) refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks] else: map_block = cached_remote_fn(_map_block_nosplit).options( **dict(remote_args, num_returns=2) ) all_refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks] data_refs = [r[0] for r in all_refs] refs = [r[1] for r in all_refs] # Release input block references. if clear_input_blocks: del blocks block_list.clear() # Common wait for non-data refs. try: results = map_bar.fetch_until_complete(refs) except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e: # One or more mapper tasks failed, or we received a SIGINT signal # while waiting; either way, we cancel all map tasks. for ref in refs: ray.cancel(ref) # Wait until all tasks have failed or been cancelled. for ref in refs: try: ray.get(ref) except (ray.exceptions.RayTaskError, ray.exceptions.TaskCancelledError): pass # Reraise the original task failure exception. raise e from None new_blocks, new_metadata = [], [] if context.block_splitting_enabled: for result in results: for block, metadata in result: new_blocks.append(block) new_metadata.append(metadata) else: for block, metadata in zip(data_refs, results): new_blocks.append(block) new_metadata.append(metadata) return BlockList(list(new_blocks), list(new_metadata))
def test_comprehensive(ray_start_regular, use_force):
    """Cancelling one input of a multi-dependency task fails its dependents."""
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        ray.get(t[0])
        return "Result"

    @ray.remote
    def combine(a, b):
        return str(a) + str(b)

    left = wait_for.remote([signaler.wait.remote()])
    right = wait_for.remote([signaler.wait.remote()])
    combo = combine.remote(left, right)
    chained = wait_for.remote([left])

    # Nothing can finish while the signal remains unsent.
    assert not ray.wait([left, right, chained, combo], timeout=1)[0]

    ray.cancel(left, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(left, timeout=10)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(chained, timeout=40)

    signaler.send.remote()
    # `combo` still fails: one of its arguments was cancelled.
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(combo)
def test_recursive_cancel(shutdown_only, use_force):
    """Cancelling a parent task also cancels the child task it spawned.

    `outer` launches `inner`; each loops forever on one CPU. The 4-CPU
    `many_resources` task can only be scheduled after the recursive cancel
    releases the CPUs held by outer/inner.
    """
    ray.init(num_cpus=4)

    @ray.remote(num_cpus=1)
    def inner():
        while True:
            time.sleep(0.1)

    @ray.remote(num_cpus=1)
    def outer():
        x = [inner.remote()]
        print(x)
        while True:
            time.sleep(0.1)

    @ray.remote(num_cpus=4)
    def many_resources():
        return 300

    outer_fut = outer.remote()
    many_fut = many_resources.remote()
    # The 4-CPU task cannot run while outer/inner hold 2 of the 4 CPUs.
    with pytest.raises(GetTimeoutError):
        ray.get(many_fut, timeout=1)
    # Fix: honor the test's `use_force` parameterization; previously the
    # cancel always used the default (non-force) mode, so the force path
    # was never exercised. valid_exceptions(use_force) accepts
    # TaskCancelledError in both modes, so this stays compatible.
    ray.cancel(outer_fut, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(outer_fut, timeout=10)

    # Recursive cancellation freed the CPUs, so this now completes.
    assert ray.get(many_fut, timeout=30)
def test_cancel_multiple_dependents(ray_start_regular, use_force):
    """Cancelling a head fails all dependents; dependents can also be
    cancelled individually while the head stays pending."""
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    head = wait_for.remote([signaler.wait.remote()])
    deps = []
    for _ in range(3):
        deps.append(wait_for.remote([head]))

    assert len(ray.wait([head], timeout=0.1)[0]) == 0
    ray.cancel(head, force=use_force)
    # Cancelling the head must propagate failure to every dependent.
    for d in deps:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(d)

    head2 = wait_for.remote([signaler.wait.remote()])
    deps2 = []
    for _ in range(3):
        # Fix: the second batch must depend on the *new* head (head2), not
        # the already-cancelled `head` — otherwise ray.get(d) below raises
        # because of the upstream cancellation, and the per-dependent
        # cancels being tested here would be vacuous.
        deps2.append(wait_for.remote([head2]))

    # Cancel the dependents directly; head2 itself stays pending.
    for d in deps2:
        ray.cancel(d, force=use_force)
    for d in deps2:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(d)

    signaler.send.remote()
    ray.get(head2)
def Terminate(self, req, context=None):
    """Serve a client Terminate RPC: cancel a task or kill an actor.

    Errors from the underlying ray call are reported back through the
    gRPC context rather than raised.
    """
    terminate_type = req.WhichOneof("terminate_type")
    if terminate_type == "task_object":
        try:
            # Look up the ref this client registered for the given task id.
            target_ref = self.object_refs[req.client_id][req.task_object.id]
            with disable_client_hook():
                ray.cancel(
                    target_ref,
                    force=req.task_object.force,
                    recursive=req.task_object.recursive,
                )
        except Exception as e:
            return_exception_in_context(e, context)
    elif terminate_type == "actor":
        try:
            target_actor = self.actor_refs[req.actor.id]
            with disable_client_hook():
                ray.kill(target_actor, no_restart=req.actor.no_restart)
        except Exception as e:
            return_exception_in_context(e, context)
    else:
        raise RuntimeError(
            "Client requested termination without providing a valid "
            "terminate_type")
    return ray_client_pb2.TerminateResponse(ok=True)
def test_fast(shutdown_only, use_force):
    """Stress-cancel quick tasks and signal-blocked tasks.

    Every ray.get on a (possibly cancelled) ref must either succeed or
    raise one of the exception types valid for this force mode.
    """
    ray.init(num_cpus=2)

    @ray.remote
    def fast(y):
        return y

    signaler = SignalActor.remote()
    ids = list()
    for _ in range(100):
        x = fast.remote("a")
        # Fix: pass the parameterized force mode; previously this cancel
        # ignored `use_force` and always used the default (non-force) mode.
        ray.cancel(x, force=use_force)
        ids.append(x)

    @ray.remote
    def wait_for(y):
        return y

    sig = signaler.wait.remote()
    for _ in range(5000):
        x = wait_for.remote(sig)
        ids.append(x)

    # Randomly cancel ~5% of the signal-blocked tasks.
    for idx in range(100, 5100):
        if random.random() > 0.95:
            # Fix: same `use_force` omission as above.
            ray.cancel(ids[idx], force=use_force)
    signaler.send.remote()

    for obj_id in ids:
        try:
            ray.get(obj_id)
        except Exception as e:
            assert isinstance(e, valid_exceptions(use_force))
def deploy_app(
    self,
    import_path: str,
    runtime_env: Dict,
    deployment_override_options: List[Dict],
) -> None:
    """Kicks off a task that deploys a Serve application.

    Cancels any previous in-progress task that is deploying a Serve
    application.

    Args:
        import_path: Serve deployment graph's import path
        runtime_env: runtime_env to run the deployment graph in
        deployment_override_options: All dictionaries should contain
            argument-value options that can be passed directly into a
            set_options() call. Overrides deployment options set in the
            graph itself.
    """
    # Only one deploy task may be in flight at a time.
    previous_request = self.config_deployment_request_ref
    if previous_request is not None:
        ray.cancel(previous_request)
        logger.info("Received new config deployment request. Cancelling "
                    "previous request.")

    self.config_deployment_request_ref = run_graph.options(
        runtime_env=runtime_env).remote(import_path, runtime_env,
                                        deployment_override_options)

    self.deployment_timestamp = time.time()
def cancel(self) -> None:
    """Cancel the running workflow."""
    frontier = self._state.running_frontier
    for fut, workflow_ref in frontier.items():
        fut.cancel()
        try:
            # Best effort: the underlying Ray task may already be finished.
            ray.cancel(workflow_ref.ref, force=True)
        except Exception:
            pass
def finish(self, result_handler=None):
    """Drain all pending results; on Ctrl-C, cancel whatever still waits.

    Args:
        result_handler: Optional callback invoked with each fetched result.
    """
    try:
        while self._waiting:
            ready = self._wait()
            if not result_handler:
                continue
            for filename_id in ready:
                result_handler(ray.get(filename_id))
    except KeyboardInterrupt:
        # Interrupted: cancel every task that has not completed yet.
        for pending_id in self._waiting:
            ray.cancel(pending_id)
def _stop_workers(self):
    """Force-cancel all outstanding worker tasks and reset worker state."""
    if not self._worker_tasks:
        return
    logging.info("Terminating workers")
    for index, task in self._worker_tasks.items():
        try:
            ray.cancel(task, force=True)
        except Exception:
            # Best effort; the task may already be finished or gone.
            pass
    self._worker_tasks = {}
    self._workers = {}
def stop(self, force=True):
    """Shut down the worker pool and clear all Redis queues.

    Args:
        force: Must be True; graceful (non-force) shutdown is not
            implemented.

    Raises:
        NotImplementedError: If force is False.
    """
    # Fix: validate before mutating any state. Previously the sentinel
    # pushes happened first, so a force=False call polluted the task
    # queue before raising.
    if not force:
        raise NotImplementedError

    # One shutdown sentinel per worker.
    for _ in range(len(self._workers)):
        self._r_client.lpush(TASK_QUEUE_KEY, serialize(None))

    for w_id in self._workers:
        ray.cancel(w_id, force=True)

    for key in [TASK_QUEUE_KEY, RESULTS_QUEUE_KEY, DATA_KEY]:
        self._r_client.delete(key)
async def force_worker_checkpoint(self):
    """Stop all workers with a non-force cancel so they can checkpoint.

    Holds the checkpoint lock while cancelling so the stop cannot
    interleave with a concurrent start/stop.
    """
    if not self._worker_tasks:
        return
    logging.info("Checkpoint needed: stopping workers")
    async with self._checkpoint_lock:
        if self._running:
            for index, task in self._worker_tasks.items():
                # Non-force cancel raises inside the worker, letting it
                # unwind and checkpoint. Fix: the original
                # `except Exception as e: raise e` was a no-op wrapper
                # that only degraded the traceback; let errors propagate
                # naturally instead.
                ray.cancel(task, force=False)
            self._running = False
def stop_computation():
    """Stop the computation tasks running on the Ray nodes."""
    print('stop_computation()')
    global result_ids
    global is_canceled
    # Signal cooperative cancellation to any code polling this flag.
    is_canceled = True
    # Force-kill every outstanding task.
    for result_id in result_ids:
        ray.cancel(result_id, force=True)
    result_ids = []
    # Reset the flag so a new computation can start cleanly.
    is_canceled = False
def test_object_store_memory_reporting_task():
    """Object-store memory pinned by a task argument is freed on force-cancel."""
    @ray.remote
    def f(x):
        time.sleep(60)

    try:
        ray.init(num_cpus=1, object_store_memory=500 * MB)
        wait_for_condition(lambda: object_store_memory(500 * MB))
        # The 150 MiB argument stays pinned while the task sleeps.
        pinned = f.remote(np.zeros(150 * 1024 * 1024, dtype=np.uint8))
        wait_for_condition(lambda: object_store_memory(350 * MB))
        # Force-cancelling the task should release the pinned argument.
        ray.cancel(pinned, force=True)
        wait_for_condition(lambda: object_store_memory(500 * MB))
    finally:
        ray.shutdown()
def deploy_app(
    self, config: ServeApplicationSchema, update_time: bool = True
) -> None:
    """Kicks off a task that deploys a Serve application.

    Cancels any previous in-progress task that is deploying a Serve
    application.

    Args:
        config: Contains the following:
            import_path: Serve deployment graph's import path.
            runtime_env: runtime_env to run the deployment graph in.
            deployment_override_options: Dictionaries that contain
                argument-value options that can be passed directly into a
                set_options() call. Overrides deployment options set in
                the graph's code itself.
        update_time: Whether to update the deployment_timestamp.
    """
    if update_time:
        self.deployment_timestamp = time.time()

    # Checkpoint the config so it survives controller restarts.
    config_dict = config.dict(exclude_unset=True)
    self.kv_store.put(
        CONFIG_CHECKPOINT_KEY,
        pickle.dumps((self.deployment_timestamp, config_dict)),
    )

    # Only one deploy task may run at a time; cancel any in-flight one.
    if self.config_deployment_request_ref is not None:
        ray.cancel(self.config_deployment_request_ref)
        logger.info(
            "Received new config deployment request. Cancelling "
            "previous request."
        )

    deployment_override_options = config.dict(
        by_alias=True, exclude_unset=True
    ).get("deployments", [])

    self.config_deployment_request_ref = run_graph.options(
        runtime_env=config.runtime_env
    ).remote(config.import_path, config.runtime_env, deployment_override_options)
def Terminate(self, request, context=None):
    """Serve a Terminate RPC by cancelling a task or killing an actor.

    Handles are transferred as cloudpickled refs; failures are reported
    back through the gRPC context instead of being raised.
    """
    kind = request.WhichOneof("terminate_type")
    if kind == "task_object":
        try:
            target = cloudpickle.loads(request.task_object.handle)
            ray.cancel(
                target,
                force=request.task_object.force,
                recursive=request.task_object.recursive)
        except Exception as e:
            return_exception_in_context(e, context)
    elif kind == "actor":
        try:
            target = cloudpickle.loads(request.actor.handle)
            ray.kill(target, no_restart=request.actor.no_restart)
        except Exception as e:
            return_exception_in_context(e, context)
    else:
        raise RuntimeError(
            "Client requested termination without providing a valid "
            "terminate_type")
    return ray_client_pb2.TerminateResponse(ok=True)
def test_stress(shutdown_only, use_force):
    """Randomly cancel a mix of infinitely-sleeping and no-op tasks."""
    ray.init(num_cpus=1)

    @ray.remote
    def infinite_sleep(y):
        if y:
            while True:
                time.sleep(1 / 10)

    first = infinite_sleep.remote(True)
    sleep_or_no = [random.randint(0, 1) for _ in range(100)]
    tasks = [infinite_sleep.remote(i) for i in sleep_or_no]

    cancelled = set()
    # First pass: cancel roughly half the tasks at random.
    for task in tasks:
        if random.random() > 0.5:
            ray.cancel(task, force=use_force)
            cancelled.add(task)
    ray.cancel(first, force=use_force)
    cancelled.add(first)

    for ref in cancelled:
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ref, timeout=120)

    # Second pass: any sleeper not yet cancelled would hang forever, so
    # cancel it now; non-sleepers must complete normally.
    for idx, task in enumerate(tasks):
        if sleep_or_no[idx]:
            ray.cancel(task, force=use_force)
            cancelled.add(task)
        if task in cancelled:
            with pytest.raises(valid_exceptions(use_force)):
                ray.get(task, timeout=120)
        else:
            ray.get(task, timeout=120)
def generate_networks(cfg, overwrite=False):
    """Generate one network per configured random seed via Ray tasks.

    Args:
        cfg: Configuration object providing the seed range under
            "meta/random_seeds/start" and "meta/random_seeds/end".
        overwrite: Whether existing outputs may be overwritten.
    """
    seed_start = cfg.get_config("meta/random_seeds/start")
    seed_end = cfg.get_config("meta/random_seeds/end")
    seeds = range(seed_start, seed_end + 1)
    initialise_ray(cfg)
    # Fix: bind `futures` before the try so the except handlers never hit
    # a NameError if task submission itself raises.
    futures = []
    try:
        futures = [
            generate_one_network.remote(cfg, seed, overwrite)
            for seed in seeds
        ]
        wait_for_completion(futures, len(seeds))
    except KeyboardInterrupt:
        for process in futures:
            ray.cancel(process)
        # Fix: os._exit requires an integer exit code; the original passed
        # the KeyboardInterrupt object itself, which raises TypeError.
        # 130 is the conventional exit status for SIGINT.
        os._exit(130)
    except Exception as e:
        for process in futures:
            ray.cancel(process)
        print(e)
def test_fast(shutdown_only, use_force):
    """Cancel quick tasks and a large batch of signal-blocked tasks."""
    ray.init(num_cpus=2)

    @ray.remote
    def fast(y):
        return y

    signaler = SignalActor.remote()
    refs = []
    for _ in range(100):
        ref = fast.remote("a")
        # NOTE If a non-force Cancellation is attempted in the time
        # between a worker receiving a task and the worker executing
        # that task (specifically the python execution), Cancellation
        # can fail.
        time.sleep(0.1)
        ray.cancel(ref, force=use_force)
        refs.append(ref)

    @ray.remote
    def wait_for(y):
        return y

    sig = signaler.wait.remote()
    for _ in range(5000):
        refs.append(wait_for.remote(sig))

    # Randomly cancel ~5% of the blocked tasks.
    for idx in range(100, 5100):
        if random.random() > 0.95:
            ray.cancel(refs[idx], force=use_force)
    signaler.send.remote()

    for i, obj_ref in enumerate(refs):
        try:
            ray.get(obj_ref, timeout=120)
        except Exception as e:
            assert isinstance(
                e, valid_exceptions(use_force)), f"Failure on iteration: {i}"
def test_cancel_chain(ray_start_regular, use_force):
    """Cancelling within a dependency chain only fails downstream tasks."""
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(t):
        return ray.get(t[0])

    # Four-task chain rooted at the signal actor.
    obj1 = wait_for.remote([signaler.wait.remote()])
    obj2 = wait_for.remote([obj1])
    obj3 = wait_for.remote([obj2])
    obj4 = wait_for.remote([obj3])

    assert not ray.wait([obj1], timeout=0.1)[0]
    # Cancelling the root fails the entire chain.
    ray.cancel(obj1, force=use_force)
    for ob in (obj1, obj2, obj3, obj4):
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ob)

    signaler2 = SignalActor.remote()
    obj1 = wait_for.remote([signaler2.wait.remote()])
    obj2 = wait_for.remote([obj1])
    obj3 = wait_for.remote([obj2])
    obj4 = wait_for.remote([obj3])

    assert not ray.wait([obj3], timeout=0.1)[0]
    # Cancelling mid-chain fails only obj3 and its dependent obj4.
    ray.cancel(obj3, force=use_force)
    for ob in (obj3, obj4):
        with pytest.raises(valid_exceptions(use_force)):
            ray.get(ob)

    # Upstream tasks are untouched: still pending until the signal is sent.
    with pytest.raises(GetTimeoutError):
        ray.get(obj1, timeout=0.1)
    with pytest.raises(GetTimeoutError):
        ray.get(obj2, timeout=0.1)

    signaler2.send.remote()
    ray.get(obj1)
def apply(self, fn: Any, remote_args: dict, blocks: BlockList) -> BlockList:
    """Map ``fn`` over every block using one Ray task per block.

    Args:
        fn: Transformation applied to each block.
        remote_args: Options forwarded to the remote map tasks.
        blocks: Input blocks with their metadata.

    Returns:
        A new BlockList of transformed blocks and their metadata.
    """
    # Handle empty datasets.
    if blocks.initial_num_blocks() == 0:
        return blocks

    blocks = list(blocks.iter_blocks_with_metadata())
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    map_block = cached_remote_fn(_map_block)
    refs = [
        map_block.options(**remote_args).remote(b, fn, m.input_files)
        for b, m in blocks
    ]

    try:
        results = map_bar.fetch_until_complete(refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # One or more mapper tasks failed, or we received a SIGINT signal
        # while waiting; either way, we cancel all map tasks.
        for ref in refs:
            ray.cancel(ref)
        # Wait until all tasks have failed or been cancelled.
        for ref in refs:
            try:
                ray.get(ref)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Reraise the original task failure exception.
        raise e from None

    new_blocks, new_metadata = [], []
    # Each task may return multiple (block, metadata) pairs; flatten them.
    for result in results:
        for block, metadata in result:
            new_blocks.append(block)
            new_metadata.append(metadata)
    return BlockList(list(new_blocks), list(new_metadata))
def test_remote_cancel(ray_start_regular, use_force):
    """A task launched *inside* another task can be cancelled from outside."""
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(y):
        return ray.get(y[0])

    @ray.remote
    def remote_wait(sg):
        return [wait_for.remote([sg[0]])]

    sig = signaler.wait.remote()

    # `outer` hands back the ref of a nested task blocked on the signal.
    outer = remote_wait.remote([sig])
    inner = ray.get(outer)[0]

    # The nested task cannot finish while the signal is unsent.
    with pytest.raises(GetTimeoutError):
        ray.get(inner, timeout=1)

    ray.cancel(inner, force=use_force)
    with pytest.raises(valid_exceptions(use_force)):
        ray.get(inner, timeout=10)
def test_worker(ray_fix):
    """End-to-end AdaptDL worker checkpoint/restart test.

    Runs a worker task, cancels it non-forcefully so it can checkpoint,
    then restarts a second worker from that checkpoint and verifies the
    checkpoint file contents.

    NOTE(review): synchronization relies on fixed 10-second sleeps, so
    this test may be flaky on slow machines — confirm timing assumptions.
    """
    controller = MockedController.options(name="AdaptDLController").remote()
    rank = 0
    replicas = 2
    restarts = 3
    checkpoint = None
    offset = 50
    path = "ray/adaptdl_ray/aws/_example_worker.py"
    argv = ["--arg1", "value", "--arg2", "value"]
    worker_task = run_adaptdl.remote("test_key", "test_uid", rank, replicas,
                                     restarts, checkpoint, offset, path, argv)
    # can't cancel with force=True (the worker must get a chance to
    # checkpoint, which a force-kill would prevent)
    time.sleep(10)
    ray.cancel(worker_task, force=False)
    print("canceling")
    time.sleep(10)
    checkpoint = ray.get(controller.get_checkpoint.remote())
    print(checkpoint)
    assert ('file.txt' in checkpoint)
    # Second non-force cancel as a safeguard in case the task still runs.
    ray.cancel(worker_task, force=False)
    rank = 1
    replicas = 2
    restarts = 4
    offset = 50
    # Restart a new worker from the captured checkpoint.
    worker_task = run_adaptdl.remote("test_key_2", "test_uid_2", rank,
                                     replicas, restarts, checkpoint, offset,
                                     path, argv)
    time.sleep(10)
    assert (os.path.exists("/tmp/checkpoint-test_uid_2-1/file.txt"))
    with open("/tmp/checkpoint-test_uid_2-1/file.txt", "rb") as f:
        result = int(f.read())
    assert (result == 5)
def test_pipeline_splitting_has_no_spilling(shutdown_only):
    """Splitting a windowed pipeline must not spill objects to disk."""
    # The object store is about 800MiB.
    ctx = ray.init(num_cpus=1, object_store_memory=800e6)
    # The size of dataset is 50000*(80*80*4)*8B, about 10GiB, 50MiB/block.
    ds = ray.data.range_tensor(50000, shape=(80, 80, 4), parallelism=200)

    # 2 blocks/window.
    p = ds.window(bytes_per_window=100 * 1024 * 1024).repeat()
    p1, p2 = p.split(2)

    @ray.remote
    def consume(p):
        for batch in p.iter_batches():
            pass

    consumers = [consume.remote(p1), consume.remote(p2)]
    try:
        # Run it for 20 seconds.
        ray.get(consumers, timeout=20)
    except Exception:
        # Timeout (or failure): tear the consumers down before checking.
        for task in consumers:
            ray.cancel(task, force=True)
    meminfo = memory_summary(ctx.address_info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    """Map ``fn`` over every block; each task returns (block, metadata).

    Args:
        fn: Transformation applied to each block.
        remote_args: Options forwarded to the remote map tasks.
        blocks: Input blocks with their metadata.

    Returns:
        A new BlockList of transformed blocks and their metadata.
    """
    # Handle empty datasets.
    if len(blocks) == 0:
        return blocks

    map_bar = ProgressBar("Map Progress", total=len(blocks))

    kwargs = remote_args.copy()
    # Each task returns two objects: (new_block, new_metadata).
    kwargs["num_returns"] = 2

    map_block = cached_remote_fn(_map_block)
    refs = [
        map_block.options(**kwargs).remote(b, m, fn)
        for b, m in zip(blocks, blocks.get_metadata())
    ]
    # Unzip into parallel tuples of block refs and metadata refs.
    new_blocks, new_metadata = zip(*refs)

    new_metadata = list(new_metadata)
    try:
        # Waiting on the metadata refs implicitly waits on the tasks.
        new_metadata = map_bar.fetch_until_complete(new_metadata)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # One or more mapper tasks failed, or we received a SIGINT signal
        # while waiting; either way, we cancel all map tasks.
        for ref in new_metadata:
            ray.cancel(ref)
        # Wait until all tasks have failed or been cancelled.
        for ref in new_metadata:
            try:
                ray.get(ref)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Reraise the original task failure exception.
        raise e from None

    return BlockList(list(new_blocks), list(new_metadata))
def test_errors_before_initializing_ray():
    """Every public API call raises RayConnectionError before ray.init()."""
    @ray.remote
    def f():
        pass

    @ray.remote
    class Foo:
        pass

    api_methods = [
        f.remote,
        Foo.remote,
        ray.actors,
        lambda: ray.cancel(None),  # Not valid API usage.
        lambda: ray.get([]),
        lambda: ray.get_actor("name"),
        ray.get_gpu_ids,
        ray.get_resource_ids,
        ray.get_webui_url,
        ray.jobs,
        lambda: ray.kill(None),  # Not valid API usage.
        ray.nodes,
        ray.objects,
        lambda: ray.put(1),
        lambda: ray.wait([]),
    ]

    def assert_all_raise():
        for api_method in api_methods:
            print(api_method)
            with pytest.raises(ray.exceptions.RayConnectionError,
                               match="Ray has not been started yet."):
                api_method()

    assert_all_raise()
    # Make sure that the exceptions are still raised after Ray has been
    # started and shutdown.
    ray.init(num_cpus=0)
    ray.shutdown()
    assert_all_raise()
timeit.timeit(lambda: in_order(), number=1)


# In[ ]:


#tag::handle_bad_futures[]
futures = list(map(lambda x: remote_task.remote(x),
                   [1, threading.TIMEOUT_MAX]))
# While we still have pending futures
while len(futures) > 0:
    # In practice 10 seconds is too short for most cases.
    ready_futures, rest_futures = ray.wait(futures, timeout=10, num_returns=1)
    # If we get back anything less than num_returns
    if len(ready_futures) < 1:
        print(f"Timed out on {rest_futures}")
        # You don't _have_ to cancel, but if your task is using a lot of
        # resources you should. Fix: ray.cancel accepts a single ObjectRef,
        # so cancel each pending future individually — unpacking the whole
        # list into one call raises TypeError when more than one future
        # remains.
        for pending in rest_futures:
            ray.cancel(pending)
        # You should break since you exceeded your timeout
        break
    for id in ready_futures:
        print(f'completed value {id}, result {ray.get(id)}')
    futures = rest_futures
#end::handle_bad_futures[]


# In[ ]:


remote_task.remote(1)


# In[ ]:


#tag::ray_remote_seq[]
def cancel(self, obj, *, force=False, recursive=True):
    """Cancel a Ray task, delegating directly to ``ray.cancel``.

    Args:
        obj: ObjectRef of the task to cancel.
        force: Whether to force-kill a running task.
        recursive: Whether to also cancel tasks the target spawned.
    """
    result = ray.cancel(obj, force=force, recursive=recursive)
    return result