def _put_serve_snapshot(self) -> None:
    """Write a JSON snapshot of all deployments to the snapshot store.

    Every deployment returned by ``list_deployments_internal`` (deleted ones
    included) becomes one entry keyed by deployment name. Entries whose
    status is "RUNNING" additionally list their RUNNING replicas, keyed by
    the hex actor ID. The result is JSON-encoded as UTF-8 and stored under
    ``SNAPSHOT_KEY``.
    """
    snapshot = {}
    deployments = self.list_deployments_internal(include_deleted=True)
    for deployment_name, (info, route_prefix) in deployments.items():
        record = {
            "name": deployment_name,
            "namespace": ray.get_runtime_context().namespace,
            "ray_job_id": info.deployer_job_id.hex(),
            "class_name": info.replica_config.deployment_def_name,
            "version": info.version,
            "http_route": route_prefix,
            "start_time": info.start_time_ms,
            "end_time": info.end_time_ms or 0,
            # A truthy end time means the deployment has been deleted.
            "status": "DELETED" if info.end_time_ms else "RUNNING",
            "actors": {},
        }

        if record["status"] == "RUNNING":
            state = self.deployment_state_manager._deployment_states[
                deployment_name]
            for replica in state._replicas.get([ReplicaState.RUNNING]):
                try:
                    handle = replica.actor_handle
                except ValueError:
                    # Actor died or hasn't yet been created.
                    continue
                actor_id = handle._ray_actor_id.hex()
                tag = replica.replica_tag
                if replica.version is None or replica.version.unversioned:
                    code_version = None
                else:
                    code_version = replica.version.code_version
                record["actors"][actor_id] = {
                    "replica_tag": tag,
                    "version": code_version,
                }

        snapshot[deployment_name] = record

    payload = json.dumps(snapshot).encode("utf-8")
    self.snapshot_store.put(SNAPSHOT_KEY, payload)
async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
):
    """Set up controller state and start the control loop.

    Args:
        controller_name: Name of this controller; also used, together with
            the current Ray namespace, to build the KV-store namespace.
        http_config: HTTP options forwarded to ``HTTPState``.
        checkpoint_path: Location passed to ``make_kv_store`` for
            checkpoints.
        detached: Forwarded to ``HTTPState`` and ``DeploymentStateManager``.
    """
    # Used to read/write checkpoints.
    self.controller_namespace = ray.get_runtime_context().namespace
    self.controller_name = controller_name
    self.checkpoint_path = checkpoint_path
    kv_store_namespace = (
        f"{self.controller_name}-{self.controller_namespace}")
    self.kv_store = make_kv_store(
        checkpoint_path, namespace=kv_store_namespace)
    self.snapshot_store = RayInternalKVStore(namespace=kv_store_namespace)

    # Dictionary of deployment_name -> proxy_name -> queue length.
    self.deployment_stats = defaultdict(lambda: defaultdict(dict))

    # Used to ensure that only a single state-changing operation happens
    # at any given time.
    self.write_lock = asyncio.Lock()

    self.long_poll_host = LongPollHost()

    self.http_state = HTTPState(controller_name, detached, http_config)
    self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

    # Fetch all running actors in current cluster as source of current
    # replica state for controller failure recovery
    all_current_actor_names = ray.util.list_named_actors()
    self.deployment_state_manager = DeploymentStateManager(
        controller_name,
        detached,
        self.kv_store,
        self.long_poll_host,
        all_current_actor_names,
    )

    # TODO(simon): move autoscaling related stuff into a manager.
    self.autoscaling_metrics_store = InMemoryMetricsStore()

    # Keep a strong reference to the control-loop task: the event loop only
    # holds weak references to tasks, so an unreferenced task may be
    # garbage collected before it finishes.
    self._control_loop_task = asyncio.get_event_loop().create_task(
        self.run_control_loop())
def test_serve_controller_namespace(ray_shutdown, namespace: Optional[str],
                                    detached: bool):
    """
    Checks which namespace the serve controller actor lives in.

    With an explicit namespace the controller is expected there. Without
    one, a detached controller is expected in the "serve" namespace, and a
    non-detached controller in the current runtime namespace. Looking the
    actor up by namespace also exercises fetching the controller from
    another namespace in the "serve" case.
    """
    ray.init(namespace=namespace)
    serve.start(detached=detached)
    client = serve.api._global_client

    if not namespace:
        if detached:
            expected_namespace = "serve"
        else:
            expected_namespace = ray.get_runtime_context().namespace
    else:
        expected_namespace = namespace

    controller = ray.get_actor(
        client._controller_name, namespace=expected_namespace)
    assert controller
def connect_ray_pdb(host=None,
                    port=None,
                    patch_stdstreams=False,
                    quiet=None,
                    breakpoint_uuid=None):
    """
    Opens a remote PDB on first available port.

    Breakpoint metadata is published in the internal KV store under
    ``RAY_PDB_<breakpoint_uuid>`` while the debugger waits in ``listen()``,
    and is removed afterwards, including when ``listen()`` raises.

    Args:
        host: Address to bind; defaults to ``$REMOTE_PDB_HOST`` or
            "127.0.0.1".
        port: Port to bind; defaults to ``$REMOTE_PDB_PORT`` or 0.
        patch_stdstreams: Forwarded to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``; defaults to the truthiness of
            ``$REMOTE_PDB_QUIET``.
        breakpoint_uuid: Identifier for this breakpoint; a random hex UUID
            is generated when falsy.

    Returns:
        The ``RemotePdb`` instance after ``listen()`` has returned.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex

    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet)
    sockname = rdb._listen_socket.getsockname()
    pdb_address = f"{sockname[0]}:{sockname[1]}"

    # Index 2 is the caller's caller. Keep this call directly in this
    # function: moving it into a helper would shift the frame index.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(
            traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }

    kv_key = f"RAY_PDB_{breakpoint_uuid}"
    _internal_kv_put(kv_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Always drop the advertisement so a failed or interrupted listen()
        # does not leave a stale breakpoint entry behind.
        _internal_kv_del(kv_key)

    return rdb
def resume_async(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously.

    Resume a workflow and retrieve its output. If the workflow was incomplete,
    it will be re-executed from its checkpointed outputs. If the workflow was
    complete, returns the result immediately.

    Examples:
        >>> from ray import workflow
        >>> start_trip = ... # doctest: +SKIP
        >>> trip = start_trip.step() # doctest: +SKIP
        >>> res1 = trip.run_async(workflow_id="trip1") # doctest: +SKIP
        >>> res2 = workflow.resume_async("trip1") # doctest: +SKIP
        >>> assert ray.get(res1) == ray.get(res2) # doctest: +SKIP

    Args:
        workflow_id: The id of the workflow to resume.

    Returns:
        An object reference that can be used to retrieve the workflow result.

    Raises:
        RuntimeError: If the management actor reports the workflow as
            non-terminating (already running or pending).
    """
    _ensure_workflow_initialized()
    logger.info(f'Resuming workflow [id="{workflow_id}"].')

    manager = workflow_access.get_management_actor()
    still_active = ray.get(
        manager.is_workflow_non_terminating.remote(workflow_id))
    if still_active:
        raise RuntimeError(
            f"Workflow '{workflow_id}' is already running or pending.")

    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller holds the reference to the workflow result.
    # Otherwise if the actor removes the reference of the workflow
    # output, the caller may fail to resolve the result.
    current_job_id = ray.get_runtime_context().job_id.hex()
    step_context = workflow_context.WorkflowStepContext(
        workflow_id=workflow_id)
    ray.get(manager.reconstruct_workflow.remote(current_job_id, step_context))
    output_ref = manager.execute_workflow.remote(current_job_id, step_context)

    logger.info(f"Workflow job {workflow_id} resumed.")
    return output_ref
def _put_serve_snapshot(self) -> None:
    """Write a JSON snapshot of the current deployments to the KV store.

    One entry is produced per deployment returned by ``list_deployments``.
    Status is always reported as "RUNNING" and start/end times as 0. The
    result is JSON-encoded as UTF-8 and stored under ``SNAPSHOT_KEY``.
    """
    snapshot = {}
    for deployment_name, (info, route_prefix) in (
            self.list_deployments().items()):
        snapshot[deployment_name] = {
            "name": deployment_name,
            "namespace": ray.get_runtime_context().namespace,
            "ray_job_id": (info.deployer_job_id.hex()
                           if info.deployer_job_id is not None else "None"),
            "class_name": info.replica_config.func_or_class_name,
            "version": info.version or "Unversioned",
            # TODO(architkulkarni): When we add the feature to allow
            # deployments with no HTTP route, update the below line.
            # Or refactor the route_prefix logic in the Deployment class now.
            "http_route": route_prefix or f"/{deployment_name}",
            "status": "RUNNING",
            "start_time": 0,
            "end_time": 0,
        }

    payload = json.dumps(snapshot).encode("utf-8")
    self.kv_store.put(SNAPSHOT_KEY, payload)
def ClusterInfo(self, request,
                context=None) -> ray_client_pb2.ClusterInfoResponse:
    """Answer a cluster-info request according to ``request.type``.

    Resource requests fill ``resource_table`` with float-valued resources,
    runtime-context requests fill ``runtime_context``, and any other type
    is answered through ``_return_debug_cluster_info`` in ``json``. The
    response always echoes the request type.
    """
    info_types = ray_client_pb2.ClusterInfoType
    response = ray_client_pb2.ClusterInfoResponse()
    response.type = request.type

    if request.type in (info_types.CLUSTER_RESOURCES,
                        info_types.AVAILABLE_RESOURCES):
        with disable_client_hook():
            if request.type == info_types.CLUSTER_RESOURCES:
                raw_resources = ray.cluster_resources()
            else:
                raw_resources = ray.available_resources()
        # Normalize resources into floats
        # (the function may return values that are ints)
        as_floats = {
            key: float(amount)
            for key, amount in raw_resources.items()
        }
        table = ray_client_pb2.ClusterInfoResponse.ResourceTable(
            table=as_floats)
        response.resource_table.CopyFrom(table)
    elif request.type == info_types.RUNTIME_CONTEXT:
        runtime_ctx = ray_client_pb2.ClusterInfoResponse.RuntimeContext()
        with disable_client_hook():
            rtc = ray.get_runtime_context()
            runtime_ctx.job_id = rtc.job_id.binary()
            runtime_ctx.node_id = rtc.node_id.binary()
            runtime_ctx.namespace = rtc.namespace
            runtime_ctx.capture_client_tasks = (
                rtc.should_capture_child_tasks_in_placement_group)
            runtime_ctx.runtime_env = json.dumps(rtc.runtime_env)
        response.runtime_context.CopyFrom(runtime_ctx)
    else:
        with disable_client_hook():
            response.json = self._return_debug_cluster_info(
                request, context)

    return response
def __init__(
    self,
    controller_name: str,
    checkpoint_path: str,
    detached: bool = False,
    dedicated_cpu: bool = False,
    http_proxy_port: int = 8000,
):
    """Attach to the named controller in the "serve" namespace, or create it.

    Args:
        controller_name: Name of the controller actor to look up or create.
        checkpoint_path: Passed through to the new controller.
        detached: When true the controller is created with a "detached"
            lifetime.
        dedicated_cpu: When true the controller reserves one CPU, else zero.
        http_proxy_port: Port set on the ``HTTPOptions`` given to a newly
            created controller.
    """
    try:
        self._controller = ray.get_actor(controller_name, namespace="serve")
    except ValueError:
        self._controller = None

    if self._controller is not None:
        # An existing controller was found; nothing to create.
        return

    # Used for scheduling things to the head node explicitly.
    head_node_id = ray.get_runtime_context().node_id.hex()
    http_config = HTTPOptions()
    http_config.port = http_proxy_port

    controller_cls = ServeController.options(
        num_cpus=1 if dedicated_cpu else 0,
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
        # Schedule the controller on the head node with a soft constraint.
        # This prefers it to run on the head node in most cases, but allows
        # it to be restarted on other nodes in an HA cluster.
        scheduling_strategy=NodeAffinitySchedulingStrategy(
            head_node_id, soft=True
        ),
        namespace="serve",
        max_concurrency=CONTROLLER_MAX_CONCURRENCY,
    )
    self._controller = controller_cls.remote(
        controller_name,
        http_config=http_config,
        checkpoint_path=checkpoint_path,
        head_node_id=head_node_id,
        detached=detached,
    )
def get_controller_namespace(
        detached: bool, _override_controller_namespace: Optional[str] = None):
    """Gets the controller's namespace.

    An explicit override always wins. Otherwise the current runtime
    namespace is used, except that a detached controller in an anonymous
    namespace is placed in "serve".

    Args:
        detached (bool): Whether serve.start() was called with detached=True
        _override_controller_namespace (Optional[str]): When set, this is the
            controller's namespace
    """
    if _override_controller_namespace is None:
        current_namespace = ray.get_runtime_context().namespace
        # Start controller in "serve" namespace if detached and currently
        # in anonymous namespace.
        is_anonymous = ANONYMOUS_NAMESPACE_PATTERN.fullmatch
        if detached and is_anonymous(current_namespace) is not None:
            return "serve"
        return current_namespace

    return _override_controller_namespace
def wait_signal(self):
    """Block on ``signal.wait``, then return the actor call stats."""
    ray.get(signal.wait.remote())
    runtime_context = ray.get_runtime_context()
    return runtime_context._get_actor_call_stats()
def __init__(self):
    """Record, at construction time, whether this actor was reconstructed."""
    runtime_context = ray.get_runtime_context()
    self._was_reconstructed = runtime_context.was_current_actor_reconstructed
def ping(self):
    """Return the hex ID of the node this call runs on."""
    node_id = ray.get_runtime_context().node_id
    return node_id.hex()
def check_and_get_node_id(self):
    """Import and call ``test_module.one``, then return the current node ID."""
    import test_module

    test_module.one()
    runtime_context = ray.get_runtime_context()
    return runtime_context.node_id
def get_task_working_dir():
    """Return the working_dir of the current runtime env."""
    # Check behavior of working_dir: The cwd should contain the
    # current file, which is being used as a job entrypoint script.
    entrypoint_script = "per_task_runtime_env.py"
    assert os.path.exists(entrypoint_script)

    runtime_env = ray.get_runtime_context().runtime_env
    return runtime_env.working_dir()
def h():
    """Pass the current actor to ``intentional_kill`` and then linger."""
    this_actor = ray.get_runtime_context().current_actor
    ray.get(intentional_kill.remote(this_actor))
    # Don't return here to leave time for actor exit.
    time.sleep(100)
def task(node_id, job_id):
    """Assert the runtime context dict matches the expected node and job."""
    ctx = ray.get_runtime_context().get()
    for key, expected in (("node_id", node_id), ("job_id", job_id)):
        assert ctx[key] == expected
    assert ctx["task_id"] is not None
    # This context is expected to carry no actor ID.
    assert "actor_id" not in ctx
def current_job_id(self):
    """Return the job ID reported by the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.job_id
async def func(self):
    """Await ``signal.wait``, then return the actor call stats."""
    await signal.wait.remote()
    runtime_context = ray.get_runtime_context()
    return runtime_context._get_actor_call_stats()
def update_was_reconstructed(self):
    """Return whether the current actor was reconstructed."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.was_current_actor_reconstructed
def get_env():
    """Return the runtime env reported by the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.runtime_env
def current_actor_id(self):
    """Return the actor ID reported by the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.actor_id
def get(self):
    """Return the runtime env reported by the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.runtime_env
def get_id(self):
    """Return the current actor ID as a hex string."""
    actor_id = ray.get_runtime_context().actor_id
    return actor_id.hex()
def get_node_id():
    """Return the node ID reported by the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context.node_id
def get_address(self):
    """Return this node's IP address, using the API that fits the Ray version.

    From version 1.0.0 on, the address is read from the runtime context's
    worker; for older versions ``ray.services`` is used instead.
    """
    if self.ray_version >= StrictVersion('1.0.0'):
        worker = ray.get_runtime_context().worker
        return worker.node_ip_address
    return ray.services.get_node_ip_address()
def echo(self, s):
    """Forward ``s`` to ``echo2`` on the current actor, returning the remote call's result."""
    current = ray.get_runtime_context().current_actor
    return current.echo2.remote(s)
def test_runtime_context(shutdown_only):
    """The namespace given to ray.init is visible via the runtime context."""
    ray.init(namespace="abc")

    namespace = ray.get_runtime_context().namespace
    assert namespace == "abc"

    # The dict form of the context must report the same namespace.
    context_dict = ray.get_runtime_context().get()
    assert namespace == context_dict["namespace"]
def func():
    """Return the actor call stats from the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context._get_actor_call_stats()
def deploy(
        self,
        name: str,
        deployment_def: Union[Callable, Type[Callable], str],
        init_args: Tuple[Any],
        init_kwargs: Dict[Any, Any],
        ray_actor_options: Optional[Dict] = None,
        config: Optional[Union[DeploymentConfig, Dict[str, Any]]] = None,
        version: Optional[str] = None,
        prev_version: Optional[str] = None,
        route_prefix: Optional[str] = None,
        url: Optional[str] = None,
        _blocking: Optional[bool] = True) -> Optional[GoalId]:
    """Send a deployment to the controller and optionally wait for it.

    Args:
        name: Deployment name.
        deployment_def: Passed to ``ReplicaConfig`` as the deployment
            definition.
        init_args: Positional init arguments for ``ReplicaConfig``.
        init_kwargs: Keyword init arguments for ``ReplicaConfig``.
        ray_actor_options: Actor options; its "runtime_env" is filled in
            from the current job's runtime env (the whole env when absent,
            otherwise only a default "working_dir").
        config: A ``DeploymentConfig`` or a dict parsed into one.
        version: Target version forwarded to the controller.
        prev_version: Previous version forwarded to the controller.
        route_prefix: Route prefix forwarded to the controller.
        url: Only used in the "is ready" log message.
        _blocking: When truthy, wait for the goal and return ``None``;
            otherwise return the goal ID without waiting.

    Raises:
        TypeError: If ``config`` is neither a dict nor a
            ``DeploymentConfig``.
    """
    config = {} if config is None else config
    ray_actor_options = ({} if ray_actor_options is None else
                         ray_actor_options)

    job_runtime_env = ray.get_runtime_context().runtime_env
    if "runtime_env" not in ray_actor_options:
        ray_actor_options["runtime_env"] = job_runtime_env
    else:
        ray_actor_options["runtime_env"].setdefault(
            "working_dir", job_runtime_env.get("working_dir"))

    replica_config = ReplicaConfig(
        deployment_def,
        init_args=init_args,
        init_kwargs=init_kwargs,
        ray_actor_options=ray_actor_options)

    if isinstance(config, dict):
        deployment_config = DeploymentConfig.parse_obj(config)
    elif isinstance(config, DeploymentConfig):
        deployment_config = config
    else:
        raise TypeError(
            "config must be a DeploymentConfig or a dictionary.")

    autoscaling = deployment_config.autoscaling_config
    if autoscaling is not None and (
            deployment_config.max_concurrent_queries <
            autoscaling.target_num_ongoing_requests_per_replica):
        logger.warning("Autoscaling will never happen, "
                       "because 'max_concurrent_queries' is less than "
                       "'target_num_ongoing_requests_per_replica' now.")

    deploy_ref = self._controller.deploy.remote(
        name,
        deployment_config.to_proto_bytes(),
        replica_config,
        version,
        prev_version,
        route_prefix,
        ray.get_runtime_context().job_id)
    goal_id, updating = ray.get(deploy_ref)

    tag = f"component=serve deployment={name}"

    if not updating:
        logger.info(f"Deployment '{name}' is already at version "
                    f"'{version}', not updating. {tag}")
    else:
        version_note = "" if version is None else f" to version '{version}'"
        logger.info(f"Updating deployment '{name}'{version_note}. {tag}")

    if not _blocking:
        return goal_id

    self._wait_for_goal(goal_id)
    url_part = "" if url is None else f" at `{url}`"
    logger.info(
        f"Deployment '{name}{':'+version if version else ''}' is ready"
        f"{url_part}. {tag}")
    return None
def run(self):
    """Return the actor call stats from the runtime context."""
    runtime_context = ray.get_runtime_context()
    return runtime_context._get_actor_call_stats()