def actor_handle(self) -> ActorHandle:
    return ray.get_actor(self._actor_name)
def test_whether_worker_leaked_when_task_finished_with_errors(ray_start_regular):
    driver_template = """
import ray
import os
import numpy as np
import time

ray.init(address="{address}", namespace="test")

# The util actor to store the pid across jobs.
@ray.remote
class PidStoreActor:
    def __init__(self):
        self._pid = None

    def put(self, pid):
        self._pid = pid
        return True

    def get(self):
        return self._pid


def _store_pid_helper():
    try:
        pid_store_actor = ray.get_actor("pid-store", "test")
    except Exception:
        pid_store_actor = PidStoreActor.options(
            name="pid-store", lifetime="detached").remote()
    assert ray.get(pid_store_actor.put.remote(os.getpid()))


@ray.remote
def normal_task(large1, large2):
    # Record the pid of this normal task.
    _store_pid_helper()
    time.sleep(60 * 60)
    return "normaltask"


large = ray.put(np.zeros(100 * 2**10, dtype=np.int8))
obj = normal_task.remote(large, large)
print(ray.get(obj))
"""
    driver_script = driver_template.format(
        address=ray_start_regular["address"])
    driver_proc = run_string_as_driver_nonblocking(driver_script)
    try:
        driver_proc.wait(10)
    except Exception:
        pass

    def get_normal_task_pid():
        try:
            pid_store_actor = ray.get_actor("pid-store", "test")
            return ray.get(pid_store_actor.get.remote())
        except Exception:
            return None

    wait_for_condition(lambda: get_normal_task_pid() is not None, 10)
    pid_store_actor = ray.get_actor("pid-store", "test")
    normal_task_pid = ray.get(pid_store_actor.get.remote())
    assert normal_task_pid is not None
    normal_task_proc = psutil.Process(normal_task_pid)
    print("killing normal task process, pid =", normal_task_pid)
    normal_task_proc.send_signal(signal.SIGTERM)

    def normal_task_was_reconstructed():
        curr_pid = get_normal_task_pid()
        return curr_pid is not None and curr_pid != normal_task_pid

    wait_for_condition(lambda: normal_task_was_reconstructed(), 10)
    driver_proc.send_signal(signal.SIGTERM)
    # Sleep here to make sure raylet has triggered cleaning up
    # the idle workers.
    wait_for_condition(lambda: not psutil.pid_exists(normal_task_pid), 10)
def ray_dask_get(dsk, keys, **kwargs):
    """A Dask-Ray scheduler.

    This scheduler will send top-level (non-inlined) Dask tasks to a Ray
    cluster for execution. The scheduler will wait for the tasks to finish
    executing, fetch the results, and repackage them into the appropriate
    Dask collections. This particular scheduler uses a threadpool to submit
    Ray tasks.

    This can be passed directly to `dask.compute()`, as the scheduler:

    >>> dask.compute(obj, scheduler=ray_dask_get)

    You can override the currently active global Dask-Ray callbacks (e.g.
    supplied via a context manager), the number of threads to use when
    submitting the Ray tasks, or the threadpool used to submit Ray tasks:

    >>> dask.compute(
            obj,
            scheduler=ray_dask_get,
            ray_callbacks=some_ray_dask_callbacks,
            num_workers=8,
            pool=some_cool_pool,
        )

    Args:
        dsk (Dict): Dask graph, represented as a task DAG dictionary.
        keys (List[str]): List of Dask graph keys whose values we wish to
            compute and return.
        ray_callbacks (Optional[list[callable]]): Dask-Ray callbacks.
        num_workers (Optional[int]): The number of worker threads to use in
            the Ray task submission traversal of the Dask graph.
        pool (Optional[ThreadPool]): A multiprocessing threadpool to use to
            submit Ray tasks.

    Returns:
        Computed values corresponding to the provided keys.
    """
    num_workers = kwargs.pop("num_workers", None)
    pool = kwargs.pop("pool", None)
    # We attempt to reuse any other thread pools that have been created within
    # this thread and with the given number of workers. We reuse a global
    # thread pool if num_workers is not given and we're in the main thread.
    global default_pool
    thread = threading.current_thread()
    if pool is None:
        with pools_lock:
            if num_workers is None and thread is main_thread:
                if default_pool is None:
                    default_pool = ThreadPool(CPU_COUNT)
                    atexit.register(default_pool.close)
                pool = default_pool
            elif thread in pools and num_workers in pools[thread]:
                pool = pools[thread][num_workers]
            else:
                pool = ThreadPool(num_workers)
                atexit.register(pool.close)
                pools[thread][num_workers] = pool

    ray_callbacks = kwargs.pop("ray_callbacks", None)
    persist = kwargs.pop("ray_persist", False)
    enable_progress_bar = kwargs.pop("_ray_enable_progress_bar", None)

    with local_ray_callbacks(ray_callbacks) as ray_callbacks:
        # Unpack the Ray-specific callbacks.
        (
            ray_presubmit_cbs,
            ray_postsubmit_cbs,
            ray_pretask_cbs,
            ray_posttask_cbs,
            ray_postsubmit_all_cbs,
            ray_finish_cbs,
        ) = unpack_ray_callbacks(ray_callbacks)
        # NOTE: We hijack Dask's `get_async` function, injecting a different
        # task executor.
        object_refs = get_async(
            _apply_async_wrapper(
                pool.apply_async,
                _rayify_task_wrapper,
                ray_presubmit_cbs,
                ray_postsubmit_cbs,
                ray_pretask_cbs,
                ray_posttask_cbs,
            ),
            len(pool._pool),
            dsk,
            keys,
            get_id=_thread_get_id,
            pack_exception=pack_exception,
            **kwargs,
        )
        if ray_postsubmit_all_cbs is not None:
            for cb in ray_postsubmit_all_cbs:
                cb(object_refs, dsk)
        # NOTE: We explicitly delete the Dask graph here so object references
        # are garbage-collected before this function returns, i.e. before all
        # Ray tasks are done. Otherwise, no intermediate objects will be
        # cleaned up until all Ray tasks are done.
        del dsk
        if persist:
            result = object_refs
        else:
            pb_actor = None
            if enable_progress_bar:
                pb_actor = ray.get_actor("_dask_on_ray_pb")
            result = ray_get_unpack(object_refs, progress_bar_actor=pb_actor)
        if ray_finish_cbs is not None:
            for cb in ray_finish_cbs:
                cb(result)

    # Cleanup pools associated with dead threads.
    with pools_lock:
        active_threads = set(threading.enumerate())
        if thread is not main_thread:
            for t in list(pools):
                if t not in active_threads:
                    for p in pools.pop(t).values():
                        p.close()

    return result
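# --- Hedged usage sketch (not part of the scheduler source above) ---
# A minimal example of driving the scheduler defined above from user code,
# assuming `dask` and `ray` are installed and that `ray_dask_get` is
# importable (recent Ray releases ship it as ray.util.dask.ray_dask_get).
import dask
import dask.array as da
import ray

ray.init()  # or ray.init(address="auto") to join an existing cluster

# Build a lazy Dask collection and execute it on Ray by passing the
# scheduler explicitly, as the docstring above describes.
x = da.ones((1000, 1000), chunks=(100, 100))
(total,) = dask.compute(x.sum(), scheduler=ray_dask_get)
print(total)  # 1000000.0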
def test_get_non_existing_named_actor(ray_start_regular_shared):
    with pytest.raises(ValueError):
        _ = ray.get_actor("non_existing_actor")
def get_management_actor() -> "ActorHandle":
    return ray.get_actor(
        common.MANAGEMENT_ACTOR_NAME,
        namespace=common.MANAGEMENT_ACTOR_NAMESPACE)
def test_detached_actor(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name=1)

    with pytest.raises(ValueError, match="Actor name cannot be an empty string"):
        DetachedActor._remote(lifetime="detached", name="")

    with pytest.raises(ValueError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace="")

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace=2)

    d = DetachedActor._remote(lifetime="detached", name="d_actor")
    assert ray.get(d.ping.remote()) == "pong"

    with pytest.raises(ValueError, match="Please use a different name"):
        DetachedActor._remote(lifetime="detached", name="d_actor")

    address = ray_start_regular["address"]

    get_actor_name = "d_actor"
    create_actor_name = "DetachedActor"
    driver_script = """
import ray
ray.init(address="{}", namespace="default_test_namespace")

name = "{}"
assert ray.util.list_named_actors() == [name]
existing_actor = ray.get_actor(name)
assert ray.get(existing_actor.ping.remote()) == "pong"

@ray.remote
def foo():
    return "bar"

@ray.remote
class NonDetachedActor:
    def foo(self):
        return "bar"

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

    def foobar(self):
        actor = NonDetachedActor.remote()
        return ray.get([foo.remote(), actor.foo.remote()])

actor = DetachedActor._remote(lifetime="detached", name="{}")
ray.get(actor.ping.remote())
""".format(
        address, get_actor_name, create_actor_name
    )

    run_string_as_driver(driver_script)
    assert len(ray.util.list_named_actors()) == 2
    assert get_actor_name in ray.util.list_named_actors()
    assert create_actor_name in ray.util.list_named_actors()
    detached_actor = ray.get_actor(create_actor_name)
    assert ray.get(detached_actor.ping.remote()) == "pong"

    # Verify that a detached actor is able to create tasks/actors
    # even if the driver of the detached actor has exited.
    assert ray.get(detached_actor.foobar.remote()) == ["bar", "bar"]
def test_multiple_routers(ray_cluster):
    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME,
                                  serve.api._global_client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexErrors covers when cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
def start(
        detached: bool = False,
        http_host: Optional[str] = DEFAULT_HTTP_HOST,
        http_port: int = DEFAULT_HTTP_PORT,
        http_middlewares: List[Any] = [],
        http_options: Optional[Union[dict, HTTPOptions]] = None,
) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until client.shutdown() is called and
    clients to it can be connected using serve.connect(). This is only
    relevant if connecting to a long-running Ray cluster (e.g., with
    address="auto").

    Args:
        detached (bool): Whether or not the instance should be detached from
            this script.
        http_host (Optional[str]): Deprecated, use http_options instead.
        http_port (int): Deprecated, use http_options instead.
        http_middlewares (list): Deprecated, use http_options instead.
        http_options (Optional[Dict, serve.HTTPOptions]): Configuration
            options for HTTP proxy. You can pass in a dictionary or
            HTTPOptions object with fields:

            - host(str, None): Host for HTTP servers to listen on. Defaults
              to "127.0.0.1". To expose Serve publicly, you probably want to
              set this to "0.0.0.0".
            - port(int): Port for HTTP server. Defaults to 8000.
            - middlewares(list): A list of Starlette middlewares that will be
              applied to the HTTP servers in the cluster.
            - location(str, serve.config.DeploymentMode): The deployment
              location of HTTP servers:

                - "HeadOnly": start one HTTP server on the head node. Serve
                  assumes the head node is the node you executed serve.start
                  on. This is the default.
                - "EveryNode": start one HTTP server per node.
                - "NoServer" or None: disable HTTP server.
    """
    if ((http_host != DEFAULT_HTTP_HOST) or (http_port != DEFAULT_HTTP_PORT)
            or (len(http_middlewares) != 0)):
        if http_options is not None:
            raise ValueError(
                "You cannot specify both `http_options` and any of the "
                "`http_host`, `http_port`, and `http_middlewares` arguments. "
                "`http_options` is preferred.")
        else:
            warn(
                "`http_host`, `http_port`, `http_middlewares` are deprecated. "
                "Please use serve.start(http_options={'host': ..., "
                "'port': ..., 'middlewares': ...}) instead.",
                DeprecationWarning,
            )

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    register_custom_serializers()

    # Try to get serve controller if it exists
    if detached:
        controller_name = SERVE_CONTROLLER_NAME
        try:
            ray.get_actor(controller_name)
            raise RayServeException("Called serve.start(detached=True) but a "
                                    "detached instance is already running. "
                                    "Please use serve.connect() to connect to "
                                    "the running instance instead.")
        except ValueError:
            pass
    else:
        controller_name = format_actor_name(SERVE_CONTROLLER_NAME,
                                            get_random_letters())

    if isinstance(http_options, dict):
        http_options = HTTPOptions.parse_obj(http_options)
    if http_options is None:
        http_options = HTTPOptions(
            host=http_host, port=http_port, middlewares=http_middlewares)

    controller = ServeController.options(
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
        # Pin Serve controller on the head node.
        resources={
            get_current_node_resource_key(): 0.01
        },
    ).remote(
        controller_name,
        http_options,
        detached=detached,
    )

    proxy_handles = ray.get(controller.get_http_proxies.remote())
    if len(proxy_handles) > 0:
        try:
            ray.get(
                [handle.ready.remote() for handle in proxy_handles.values()],
                timeout=HTTP_PROXY_TIMEOUT,
            )
        except ray.exceptions.GetTimeoutError:
            raise TimeoutError(
                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    client = Client(controller, controller_name, detached=detached)
    _set_global_client(client)
    return client
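# --- Hedged usage sketch (not part of the serve.start source above) ---
# A minimal example of calling the API documented above, assuming `ray` and
# Ray Serve are installed and a long-running cluster is reachable at "auto".
import ray
from ray import serve

ray.init(address="auto")

# Start a detached Serve instance with one HTTP proxy per node, using the
# `http_options` dict form described in the docstring.
client = serve.start(
    detached=True,
    http_options={"host": "0.0.0.0", "port": 8000, "location": "EveryNode"},
)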
async def handler(self, block: bool):
    if block:
        signal = ray.get_actor(signal_name)
        await signal.wait.remote()

    return f"1|{os.getpid()}"
async def v1(request):
    if request.query_params["block"] == "True":
        signal = ray.get_actor(signal_name)
        await signal.wait.remote()

    return f"1|{os.getpid()}"
def get_actor_count(self, name):
    actor = ray.get_actor(name)
    return ray.get(actor.inc_and_get.remote())
def test_serializing_exceptions(ray_start_regular_shared):
    with ray_start_client_server() as ray:
        with pytest.raises(ValueError):
            ray.get_actor("abc")
from seesaw.memory_cache import ReferenceCache
import ray
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="control the data cache. calling it again will restart the cache"
    )
    # parser.add_argument('--restart', type=int, action='store_true', help='restart the cache')
    args = parser.parse_args()

    ray.init("auto", namespace="seesaw")
    actor_name = "actor#cache"

    try:
        oldh = ray.get_actor(actor_name)
        print("found old cache actor, destroying it")
        ray.kill(oldh)
    except ValueError:
        pass  # no actor to kill

    print("starting new cache actor")
    h = (
        ray.remote(ReferenceCache)
        .options(name=actor_name, num_cpus=1, lifetime="detached")
        .remote()
    )
    r = h.ready.remote()
    ray.get(r)
    print("new cache actor ready")
def __init__(self):
    self.actor = ray.get_actor("a")
def do_run(name, concurrency=4):
    name = "actor_" + str(name)
    tasks = [getter.remote(name) for _ in range(concurrency)]
    result = ray.get(tasks)
    ray.kill(ray.get_actor(name))  # Cleanup
    return result
async def reconfigure(self, config):
    # Don't block when the replica is first created.
    if self.config is not None:
        signal = ray.get_actor(signal_name)
        ray.get(signal.wait.remote())
    self.config = config
def check_name_available(name):
    try:
        ray.get_actor(name)
        return False
    except ValueError:
        return True
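# --- Hedged sketch (not from the original sources) ---
# check_name_available relies on ray.get_actor raising ValueError for an
# unknown name. The same check underpins the common get-or-create pattern
# for detached named actors; `Counter` here is a hypothetical actor used
# only to illustrate the pattern.
import ray


@ray.remote
class Counter:
    def __init__(self):
        self._count = 0

    def inc_and_get(self):
        self._count += 1
        return self._count


def get_or_create_counter(name: str):
    try:
        # Reuse the existing named actor if it is already registered.
        return ray.get_actor(name)
    except ValueError:
        # Otherwise create it as a detached actor under that name.
        return Counter.options(name=name, lifetime="detached").remote()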
def _record_step_status(step_id: "StepID", status: "WorkflowStatus") -> None:
    workflow_id = workflow_context.get_current_workflow_id()
    workflow_manager = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    ray.get(
        workflow_manager.update_step_status.remote(workflow_id, step_id,
                                                   status))
async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
    """Recover the instance state from the provided checkpoint.

    Performs the following operations:
        1) Deserializes the internal state from the checkpoint.
        2) Pushes the latest configuration to the routers in case we
           crashed before updating them.
        3) Starts/stops any worker replicas that are pending creation or
           deletion.

    NOTE: this requires that self.write_lock is already acquired and will
    release it before returning.
    """
    assert self.write_lock.locked()

    start = time.time()
    logger.info("Recovering from checkpoint")

    # Load internal state from the checkpoint data.
    (
        self.routes,
        router_node_ids,
        self.backends,
        self.traffic_policies,
        self.replicas,
        self.replicas_to_start,
        self.replicas_to_stop,
        self.backends_to_remove,
        self.endpoints_to_remove,
    ) = pickle.loads(checkpoint_bytes)

    for node_id in router_node_ids:
        router_name = format_actor_name(SERVE_PROXY_NAME,
                                        self.controller_name, node_id)
        self.routers[node_id] = ray.get_actor(router_name)

    # Fetch actor handles for all of the backend replicas in the system.
    # All of these workers are guaranteed to already exist because they
    # would not be written to a checkpoint in self.workers until they
    # were created.
    for backend_tag, replica_tags in self.replicas.items():
        for replica_tag in replica_tags:
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            self.workers[backend_tag][replica_tag] = ray.get_actor(
                replica_name)

    # Push configuration state to the router.
    # TODO(edoakes): should we make this a pull-only model for simplicity?
    for endpoint, traffic_policy in self.traffic_policies.items():
        await asyncio.gather(*[
            router.set_traffic.remote(endpoint, traffic_policy)
            for router in self.routers.values()
        ])

    for backend_tag, replica_dict in self.workers.items():
        for replica_tag, worker in replica_dict.items():
            await asyncio.gather(*[
                router.add_new_worker.remote(backend_tag, replica_tag, worker)
                for router in self.routers.values()
            ])

    for backend, info in self.backends.items():
        await asyncio.gather(*[
            router.set_backend_config.remote(backend, info.backend_config)
            for router in self.routers.values()
        ])
        await self.broadcast_backend_config(backend)
        metadata = info.backend_config.internal_metadata
        if metadata.autoscaling_config is not None:
            self.autoscaling_policies[backend] = BasicAutoscalingPolicy(
                backend, metadata.autoscaling_config)

    # Push configuration state to the routers.
    await asyncio.gather(*[
        router.set_route_table.remote(self.routes)
        for router in self.routers.values()
    ])

    # Start/stop any pending backend replicas.
    await self._start_pending_replicas()
    await self._stop_pending_replicas()

    # Remove any pending backends and endpoints.
    await self._remove_pending_backends()
    await self._remove_pending_endpoints()

    logger.info("Recovered from checkpoint in {:.3f}s".format(time.time() -
                                                              start))

    self.write_lock.release()
def main(args=None, model=None) -> GenerativeQAModule:
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = GenerativeQAModule.add_model_specific_args(parser, os.getcwd())
    parser = GenerativeQAModule.add_retriever_specific_args(parser)
    args = args or parser.parse_args()

    Path(args.output_dir).mkdir(exist_ok=True)

    named_actors = []
    if args.distributed_retriever == "ray" and args.gpus > 1:
        if not is_ray_available():
            raise RuntimeError("Please install Ray to use the Ray distributed retriever.")
        # Connect to an existing Ray cluster.
        try:
            ray.init(address=args.ray_address)
        except (ConnectionError, ValueError):
            logger.warning(
                "Connection to Ray cluster failed. Make sure a Ray "
                "cluster is running by either using Ray's cluster "
                "launcher (`ray up`) or by manually starting Ray on "
                "each node via `ray start --head` for the head node "
                "and `ray start --address='<ip address>:6379'` for "
                "additional nodes. See "
                "https://docs.ray.io/en/master/cluster/index.html "
                "for more info."
            )
            raise

        # Create Ray actors only for rank 0.
        if ("LOCAL_RANK" not in os.environ or os.environ["LOCAL_RANK"] == 0) and (
            "NODE_RANK" not in os.environ or os.environ["NODE_RANK"] == 0
        ):
            remote_cls = ray.remote(RayRetriever)
            named_actors = [
                remote_cls.options(name="retrieval_worker_{}".format(i)).remote()
                for i in range(args.num_retrieval_workers)
            ]
        else:
            logger.info(
                "Getting named actors for NODE_RANK {}, LOCAL_RANK {}".format(
                    os.environ["NODE_RANK"], os.environ["LOCAL_RANK"]
                )
            )
            named_actors = [
                ray.get_actor("retrieval_worker_{}".format(i))
                for i in range(args.num_retrieval_workers)
            ]
    args.actor_handles = named_actors
    assert args.actor_handles == named_actors

    if model is None:
        model: GenerativeQAModule = GenerativeQAModule(args)

    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        training_logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        training_logger = WandbLogger(name=model.output_dir.name, project=project)
    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        training_logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    es_callback = (
        get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
        if args.early_stopping_patience >= 0
        else False
    )

    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric),
        early_stopping_callback=es_callback,
        logger=training_logger,
        accelerator=CustomAccel() if args.gpus > 1 else None,
        profiler=pl.profiler.AdvancedProfiler() if args.profile else None,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")

    if not args.do_predict:
        return model

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
def third_actor_removed():
    try:
        ray.get_actor(third_proxy)
        return False
    except ValueError:
        return True
async def setup(self, name, controller_name):
    # Note: Several queues are used in the router
    # - When a request comes in, it's placed inside its corresponding
    #   endpoint_queue.
    # - The endpoint_queue is dequeued during flush operation, which moves
    #   the queries to backend buffer_queue. Here we match a request
    #   for an endpoint to a backend given some policy.
    # - The worker_queue is used to collect idle actor handles. These
    #   handles are dequeued during the second stage of flush operation,
    #   which assigns queries in buffer_queue to actor handles.
    self.name = name

    # -- Queues -- #

    # endpoint_name -> request queue
    # We use FIFO (left to right) ordering. The new items should be added
    # using appendleft. Old items should be removed via pop().
    self.endpoint_queues: DefaultDict[deque[Query]] = defaultdict(deque)
    # backend_name -> worker replica tag queue
    self.worker_queues: DefaultDict[deque[str]] = defaultdict(deque)
    # backend_name -> worker payload queue
    self.backend_queues = defaultdict(deque)

    # -- Metadata -- #

    # endpoint_name -> traffic_policy
    self.traffic = dict()
    # backend_name -> backend_config
    self.backend_info = dict()
    # replica tag -> worker_handle
    self.replicas = dict()
    # backend_name -> replica_tag -> concurrent queries counter
    self.queries_counter = defaultdict(lambda: defaultdict(int))

    # -- Synchronization -- #

    # This lock guarantees that only one flush operation can happen at a
    # time. Without the lock, multiple flush operations can pop from the
    # same buffer_queue and worker_queue and create deadlock. For example,
    # one operation could hold the only query while the other flush
    # operation holds the only idle replica. Additionally, allowing only
    # one flush operation at a time simplifies design overhead for custom
    # queuing and batching policies.
    self.flush_lock = asyncio.Lock()

    # -- State Restoration -- #
    # Fetch the worker handles, traffic policies, and backend configs from
    # the controller. We use a "pull-based" approach instead of pushing
    # them from the controller so that the router can transparently recover
    # from failure.
    self.controller = ray.get_actor(controller_name)

    traffic_policies = ray.get(
        self.controller.get_traffic_policies.remote())
    for endpoint, traffic_policy in traffic_policies.items():
        await self.set_traffic(endpoint, traffic_policy)

    backend_dict = ray.get(self.controller.get_all_worker_handles.remote())
    for backend_tag, replica_dict in backend_dict.items():
        for replica_tag, worker in replica_dict.items():
            await self.add_new_worker(backend_tag, replica_tag, worker)

    backend_configs = ray.get(self.controller.get_backend_configs.remote())
    for backend, backend_config in backend_configs.items():
        await self.set_backend_config(backend, backend_config)

    # -- Metrics Registration -- #
    self.num_router_requests = metrics.Count(
        "num_router_requests",
        "Number of requests processed by the router.", "requests",
        ["endpoint"])
    self.num_error_endpoint_requests = metrics.Count(
        "num_error_endpoint_requests",
        ("Number of requests that errored when getting results "
         "for the endpoint."), "requests", ["endpoint"])
    self.num_error_backend_requests = metrics.Count(
        "num_error_backend_requests",
        ("Number of requests that errored when getting result "
         "from the backend."), "requests", ["backend"])

    self.backend_queue_size = metrics.Gauge(
        "backend_queued_queries",
        "Current number of queries queued in the router for a backend",
        "requests", ["backend"])

    asyncio.get_event_loop().create_task(self.report_queue_lengths())
def actor_removed():
    try:
        ray.get_actor("hi")
        return False
    except ValueError:
        return True
def get_from_ray(idx, redis_address, redis_password, idx_to_store_name):
    init_ray_if_not(redis_address, redis_password)
    local_store_handle = ray.get_actor(idx_to_store_name[idx])
    partition = ray.get(local_store_handle.get_partition.remote(idx))
    return partition
def start(detached: bool = False,
          http_host: str = DEFAULT_HTTP_HOST,
          http_port: int = DEFAULT_HTTP_PORT,
          http_middlewares: List[Any] = []) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until client.shutdown() is called and
    clients to it can be connected using serve.connect(). This is only
    relevant if connecting to a long-running Ray cluster (e.g., with
    address="auto").

    Args:
        detached (bool): Whether or not the instance should be detached from
            this script.
        http_host (str): Host for HTTP servers to listen on. Defaults to
            "127.0.0.1". To expose Serve publicly, you probably want to set
            this to "0.0.0.0". One HTTP server will be started on each node
            in the Ray cluster. To not start HTTP servers, set this to None.
        http_port (int): Port for HTTP server. Defaults to 8000.
        http_middlewares (list): A list of Starlette middlewares that will
            be applied to the HTTP servers in the cluster.
    """
    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get serve controller if it exists
    if detached:
        controller_name = SERVE_CONTROLLER_NAME
        try:
            ray.get_actor(controller_name)
            raise RayServeException("Called serve.start(detached=True) but a "
                                    "detached instance is already running. "
                                    "Please use serve.connect() to connect to "
                                    "the running instance instead.")
        except ValueError:
            pass
    else:
        controller_name = format_actor_name(SERVE_CONTROLLER_NAME,
                                            get_random_letters())

    controller = ServeController.options(
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(controller_name,
             HTTPConfig(http_host, http_port, http_middlewares),
             detached=detached)

    if http_host is not None:
        futures = []
        for node_id in ray.state.node_ids():
            future = block_until_http_ready.options(
                num_cpus=0, resources={
                    node_id: 0.01
                }).remote(
                    "http://{}:{}/-/routes".format(http_host, http_port),
                    timeout=HTTP_PROXY_TIMEOUT)
            futures.append(future)
        try:
            ray.get(futures)
        except ray.exceptions.RayTaskError:
            raise TimeoutError(
                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    return Client(controller, controller_name, detached=detached)
def test_serializing_exceptions(ray_start_regular_shared):
    with ray_start_client_server() as ray:
        with pytest.raises(
                ValueError, match="Failed to look up actor with name 'abc'"):
            ray.get_actor("abc")
def get_normal_task_pid():
    try:
        pid_store_actor = ray.get_actor("pid-store", "test")
        return ray.get(pid_store_actor.get.remote())
    except Exception:
        return None
def test_get_actor_no_input(ray_start_regular_shared):
    for bad_name in [None, "", " "]:
        with pytest.raises(ValueError):
            ray.get_actor(bad_name)
def process_incremental(sum, result):
    time.sleep(1)  # Replace this with some processing code.
    return sum + result


start_pipeline = timer()
start = timer()

'''Register Actors if not registered already'''
flow1_actors = {}
actor_names = ['flow1_actor1', 'flow1_actor2', 'flow1_actor3']
for actor_name in actor_names:
    try:
        flow1_actors[actor_name] = ray.get_actor(actor_name)
        print('Actor already registered: {}'.format(actor_name))
    except ValueError:
        flow1_actors[actor_name] = Pipeline.options(
            name=actor_name, lifetime="detached").remote()
        flow1_actors[actor_name]

print("duration =", timer() - start, " seconds for registering actors")

'''
for actor_name in actor_names:
    flow1_actors[actor_name] = ray.get_actor(actor_name)
'''

df = readtextfile.ReadTextFile(
    ipfile='/tmp/data/5m_Sales_Records.csv',
    ipschemafile=
def force_stop(self):
    """Force the actor to exit without shutting down gracefully."""
    try:
        ray.kill(ray.get_actor(self._actor_name))
    except ValueError:
        pass