Exemple #1
0
    def _put_serve_snapshot(self) -> None:
        """Serialize a snapshot of all deployments and store it.

        One entry is built per deployment returned by
        ``list_deployments_internal`` (deleted ones included). Entries whose
        status is "RUNNING" also list, per replica in the RUNNING state, the
        actor id mapped to its replica tag and code version. The resulting
        mapping is written as UTF-8 encoded JSON to ``self.snapshot_store``
        under ``SNAPSHOT_KEY``.
        """
        snapshot = {}
        deployments = self.list_deployments_internal(include_deleted=True)
        for deployment_name, (info, route) in deployments.items():
            # Insertion order is preserved on purpose: it determines the
            # layout of the JSON document produced below.
            entry = {
                "name": deployment_name,
                "namespace": ray.get_runtime_context().namespace,
                "ray_job_id": info.deployer_job_id.hex(),
                "class_name": info.replica_config.deployment_def_name,
                "version": info.version,
                "http_route": route,
                "start_time": info.start_time_ms,
                "end_time": info.end_time_ms or 0,
                # A truthy end time marks the deployment as deleted.
                "status": "DELETED" if info.end_time_ms else "RUNNING",
                "actors": {},
            }
            if entry["status"] == "RUNNING":
                state = self.deployment_state_manager._deployment_states[
                    deployment_name]
                for replica in state._replicas.get([ReplicaState.RUNNING]):
                    try:
                        handle = replica.actor_handle
                    except ValueError:
                        # Actor died or hasn't yet been created.
                        continue
                    actor_key = handle._ray_actor_id.hex()
                    tag = replica.replica_tag
                    if replica.version is None or replica.version.unversioned:
                        code_version = None
                    else:
                        code_version = replica.version.code_version
                    entry["actors"][actor_key] = {
                        "replica_tag": tag,
                        "version": code_version,
                    }

            snapshot[deployment_name] = entry
        payload = json.dumps(snapshot).encode("utf-8")
        self.snapshot_store.put(SNAPSHOT_KEY, payload)
Exemple #2
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
    ):
        """Create the controller's stores and state objects, then start its loop.

        Args:
            controller_name: Name of this controller; combined with the
                current Ray namespace to form the KV-store namespace.
            http_config: Options handed to ``HTTPState``.
            checkpoint_path: Path handed to ``make_kv_store``.
            detached: Forwarded to ``HTTPState`` and
                ``DeploymentStateManager``.
        """
        # Used to read/write checkpoints.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        store_namespace = f"{self.controller_name}-{self.controller_namespace}"
        self.kv_store = make_kv_store(
            checkpoint_path, namespace=store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=store_namespace)

        # Dictionary of deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Only one state-changing operation may run at a time; this lock is
        # what enforces that.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        # The named actors currently alive in the cluster serve as the source
        # of replica state when recovering from a controller failure.
        named_actors = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            named_actors,
        )

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        # Schedule the control loop on the current event loop.
        event_loop = asyncio.get_event_loop()
        event_loop.create_task(self.run_control_loop())
Exemple #3
0
def test_serve_controller_namespace(ray_shutdown, namespace: Optional[str],
                                    detached: bool):
    """
    Checks which namespace the serve controller actor lives in.

    Expected namespace: the explicitly given one when ``namespace`` is
    truthy; otherwise "serve" for a detached controller; otherwise the
    namespace reported by the current runtime context. Looking the actor up
    by name in that namespace must succeed.
    """

    ray.init(namespace=namespace)
    serve.start(detached=detached)
    controller_name = serve.api._global_client._controller_name

    if namespace:
        expected_namespace = namespace
    else:
        # The runtime context is only consulted for the non-detached case.
        expected_namespace = ("serve" if detached else
                              ray.get_runtime_context().namespace)

    assert ray.get_actor(controller_name, namespace=expected_namespace)
Exemple #4
0
def connect_ray_pdb(host=None,
                    port=None,
                    patch_stdstreams=False,
                    quiet=None,
                    breakpoint_uuid=None):
    """
    Opens a remote PDB on first available port.

    Metadata about the breakpoint is published in the internal KV store
    under ``RAY_PDB_<breakpoint_uuid>`` while the debugger is listening and
    removed again afterwards.

    Args:
        host: Address to bind to. Defaults to the ``REMOTE_PDB_HOST``
            environment variable, or "127.0.0.1" if it is unset.
        port: Port to bind to. Defaults to the ``REMOTE_PDB_PORT``
            environment variable, or 0 if it is unset.
        patch_stdstreams: Forwarded to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``. Defaults to True when the
            ``REMOTE_PDB_QUIET`` environment variable is a non-empty string.
        breakpoint_uuid: Identifier of this breakpoint; a random UUID hex
            string is generated when it is falsy.

    Returns:
        The ``RemotePdb`` instance, after ``listen()`` has returned.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex
    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet)
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(sockname[0], sockname[1])
    # Index 2 is the frame two levels above this function in the call stack.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(kv_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Remove the published entry even when listen() raises, so a failed
        # or interrupted session does not leave a stale breakpoint behind.
        _internal_kv_del(kv_key)

    return rdb
Exemple #5
0
def resume_async(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously.

    Resume a workflow and retrieve its output. If the workflow was incomplete,
    it will be re-executed from its checkpointed outputs. If the workflow was
    complete, returns the result immediately.

    Examples:
        >>> from ray import workflow
        >>> start_trip = ... # doctest: +SKIP
        >>> trip = start_trip.step() # doctest: +SKIP
        >>> res1 = trip.run_async(workflow_id="trip1") # doctest: +SKIP
        >>> res2 = workflow.resume_async("trip1") # doctest: +SKIP
        >>> assert ray.get(res1) == ray.get(res2) # doctest: +SKIP

    Args:
        workflow_id: The id of the workflow to resume.

    Returns:
        An object reference that can be used to retrieve the workflow result.

    Raises:
        RuntimeError: If the management actor reports the workflow as
            non-terminating (already running or pending).
    """
    _ensure_workflow_initialized()
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = workflow_access.get_management_actor()
    if ray.get(
            workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
        raise RuntimeError(
            f"Workflow '{workflow_id}' is already running or pending.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures caller of 'run()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    # Block until reconstruction finishes before starting execution.
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
Exemple #6
0
 def _put_serve_snapshot(self) -> None:
     """Write a JSON snapshot of the listed deployments to the KV store.

     Every deployment from ``list_deployments`` becomes one entry; status is
     always "RUNNING" and start/end times are always 0. The mapping is stored
     as UTF-8 encoded JSON in ``self.kv_store`` under ``SNAPSHOT_KEY``.
     """
     snapshot = {}
     for deployment_name, (info, route) in self.list_deployments().items():
         # Insertion order is preserved on purpose: it determines the layout
         # of the JSON document produced below.
         snapshot[deployment_name] = {
             "name": deployment_name,
             "namespace": ray.get_runtime_context().namespace,
             "ray_job_id": ("None" if info.deployer_job_id is None else
                            info.deployer_job_id.hex()),
             "class_name": info.replica_config.func_or_class_name,
             "version": info.version or "Unversioned",
             # TODO(architkulkarni): When we add the feature to allow
             # deployments with no HTTP route, update the below line.
             # Or refactor the route_prefix logic in the Deployment class now.
             "http_route": route or f"/{deployment_name}",
             "status": "RUNNING",
             "start_time": 0,
             "end_time": 0,
         }
     encoded = json.dumps(snapshot).encode("utf-8")
     self.kv_store.put(SNAPSHOT_KEY, encoded)
Exemple #7
0
 def ClusterInfo(self,
                 request,
                 context=None) -> ray_client_pb2.ClusterInfoResponse:
     """Answer a cluster-info request according to ``request.type``.

     CLUSTER_RESOURCES and AVAILABLE_RESOURCES fill ``resource_table`` with
     float-valued resources; RUNTIME_CONTEXT fills ``runtime_context`` from
     the current runtime context; any other type stores the result of
     ``_return_debug_cluster_info`` in ``json``. All Ray calls are made with
     the client hook disabled.
     """
     info_type = ray_client_pb2.ClusterInfoType
     response_cls = ray_client_pb2.ClusterInfoResponse

     resp = response_cls()
     kind = request.type
     resp.type = kind
     if kind in (info_type.CLUSTER_RESOURCES, info_type.AVAILABLE_RESOURCES):
         with disable_client_hook():
             if kind == info_type.CLUSTER_RESOURCES:
                 raw = ray.cluster_resources()
             else:
                 raw = ray.available_resources()
         # Normalize resources into floats
         # (the function may return values that are ints)
         as_floats = {name: float(amount) for name, amount in raw.items()}
         resp.resource_table.CopyFrom(
             response_cls.ResourceTable(table=as_floats))
     elif kind == info_type.RUNTIME_CONTEXT:
         runtime_msg = response_cls.RuntimeContext()
         with disable_client_hook():
             runtime_ctx = ray.get_runtime_context()
             runtime_msg.job_id = runtime_ctx.job_id.binary()
             runtime_msg.node_id = runtime_ctx.node_id.binary()
             runtime_msg.namespace = runtime_ctx.namespace
             runtime_msg.capture_client_tasks = (
                 runtime_ctx.should_capture_child_tasks_in_placement_group)
             runtime_msg.runtime_env = json.dumps(runtime_ctx.runtime_env)
         resp.runtime_context.CopyFrom(runtime_msg)
     else:
         with disable_client_hook():
             resp.json = self._return_debug_cluster_info(request, context)
     return resp
Exemple #8
0
 def __init__(
     self,
     controller_name: str,
     checkpoint_path: str,
     detached: bool = False,
     dedicated_cpu: bool = False,
     http_proxy_port: int = 8000,
 ):
     """Attach to an existing controller actor or start a new one.

     The actor named ``controller_name`` is looked up in the "serve"
     namespace; a ``ValueError`` from the lookup is treated as "not found",
     in which case a new ``ServeController`` actor is created.

     Args:
         controller_name: Name of the controller actor.
         checkpoint_path: Forwarded to the new controller.
         detached: When true the actor is created with a "detached"
             lifetime; also forwarded to the new controller.
         dedicated_cpu: When true the actor requests one CPU, otherwise none.
         http_proxy_port: Port set on the ``HTTPOptions`` given to the new
             controller.
     """
     try:
         existing = ray.get_actor(controller_name, namespace="serve")
     except ValueError:
         existing = None
     self._controller = existing
     if existing is not None:
         return

     # Used for scheduling things to the head node explicitly.
     node_id = ray.get_runtime_context().node_id.hex()
     proxy_options = HTTPOptions()
     proxy_options.port = http_proxy_port
     # Schedule the controller on the head node with a soft constraint. This
     # prefers it to run on the head node in most cases, but allows it to be
     # restarted on other nodes in an HA cluster.
     placement = NodeAffinitySchedulingStrategy(node_id, soft=True)
     controller_cls = ServeController.options(
         num_cpus=1 if dedicated_cpu else 0,
         name=controller_name,
         lifetime="detached" if detached else None,
         max_restarts=-1,
         max_task_retries=-1,
         scheduling_strategy=placement,
         namespace="serve",
         max_concurrency=CONTROLLER_MAX_CONCURRENCY,
     )
     self._controller = controller_cls.remote(
         controller_name,
         http_config=proxy_options,
         checkpoint_path=checkpoint_path,
         head_node_id=node_id,
         detached=detached,
     )
Exemple #9
0
def get_controller_namespace(
        detached: bool, _override_controller_namespace: Optional[str] = None):
    """Gets the controller's namespace.

    Resolution order: an explicit override wins; otherwise the current
    runtime context's namespace is used, except that a detached controller
    running in an anonymous namespace is placed in "serve".

    Args:
        detached (bool): Whether serve.start() was called with detached=True
        _override_controller_namespace (Optional[str]): When set, this is the
            controller's namespace
    """

    if _override_controller_namespace is not None:
        return _override_controller_namespace

    current_namespace = ray.get_runtime_context().namespace

    # The anonymous-namespace check only applies to detached controllers.
    if detached and ANONYMOUS_NAMESPACE_PATTERN.fullmatch(
            current_namespace) is not None:
        return "serve"
    return current_namespace
Exemple #10
0
 def wait_signal(self):
     """Block until the signal's ``wait`` call completes, then return the
     actor call stats from the runtime context."""
     pending_wait = signal.wait.remote()
     ray.get(pending_wait)
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx._get_actor_call_stats()
Exemple #11
0
 def __init__(self):
     """Store the runtime context's ``was_current_actor_reconstructed``
     value at construction time."""
     runtime_ctx = ray.get_runtime_context()
     self._was_reconstructed = runtime_ctx.was_current_actor_reconstructed
Exemple #12
0
 def ping(self):
     """Return the hex form of the runtime context's node id."""
     node_id = ray.get_runtime_context().node_id
     return node_id.hex()
Exemple #13
0
        def check_and_get_node_id(self):
            """Import ``test_module``, call its ``one()``, and return the
            runtime context's node id."""
            import test_module

            # Calling into the module is the check; the result is unused.
            test_module.one()
            runtime_ctx = ray.get_runtime_context()
            return runtime_ctx.node_id
    def get_task_working_dir():
        """Assert the entrypoint script is in the cwd, then return the
        runtime env's working directory."""
        # Check behavior of working_dir: The cwd should contain the
        # current file, which is being used as a job entrypoint script.
        assert os.path.exists("per_task_runtime_env.py")

        runtime_env = ray.get_runtime_context().runtime_env
        return runtime_env.working_dir()
Exemple #15
0
 def h():
     """Pass the current actor's handle to ``intentional_kill``, wait for
     that call, then sleep."""
     this_actor = ray.get_runtime_context().current_actor
     kill_ref = intentional_kill.remote(this_actor)
     ray.get(kill_ref)
     time.sleep(100)  # Don't return here to leave time for actor exit.
Exemple #16
0
 def task(node_id, job_id):
     """Assert that the runtime context dict matches the given node and job
     ids, has a task id, and has no actor id."""
     ctx = ray.get_runtime_context().get()
     assert ctx["node_id"] == node_id
     assert ctx["job_id"] == job_id
     assert ctx["task_id"] is not None
     assert "actor_id" not in ctx
Exemple #17
0
 def current_job_id(self):
     """Return the job id reported by the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.job_id
Exemple #18
0
 async def func(self):
     """Await the signal's ``wait`` call, then return the actor call stats
     from the runtime context."""
     await signal.wait.remote()
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx._get_actor_call_stats()
Exemple #19
0
 def update_was_reconstructed(self):
     """Return the runtime context's ``was_current_actor_reconstructed``
     value."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.was_current_actor_reconstructed
Exemple #20
0
 def get_env():
     """Return the runtime env reported by the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.runtime_env
Exemple #21
0
 def current_actor_id(self):
     """Return the actor id reported by the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.actor_id
Exemple #22
0
 def get(self):
     """Return the runtime env reported by the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.runtime_env
Exemple #23
0
 def get_id(self):
     """Return the hex form of the runtime context's actor id."""
     actor_id = ray.get_runtime_context().actor_id
     return actor_id.hex()
Exemple #24
0
 def get_node_id():
     """Return the node id reported by the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx.node_id
Exemple #25
0
 def get_address(self):
     """Return the node IP address, choosing the lookup by Ray version.

     For ``self.ray_version`` at or above 1.0.0 the address is read from the
     runtime context's worker; older versions use
     ``ray.services.get_node_ip_address``.
     """
     uses_runtime_context = self.ray_version >= StrictVersion('1.0.0')
     if not uses_runtime_context:
         return ray.services.get_node_ip_address()
     worker = ray.get_runtime_context().worker
     return worker.node_ip_address
Exemple #26
0
 def echo(self, s):
     """Call ``echo2`` remotely on the current actor's own handle with ``s``
     and return the result of that ``.remote`` call."""
     own_handle = ray.get_runtime_context().current_actor
     return own_handle.echo2.remote(s)
Exemple #27
0
def test_runtime_context(shutdown_only):
    """The namespace passed to ``ray.init`` is exposed both as the runtime
    context's ``namespace`` attribute and in its ``get()`` dict."""
    expected = "abc"
    ray.init(namespace=expected)
    reported = ray.get_runtime_context().namespace
    assert reported == expected
    context_dict = ray.get_runtime_context().get()
    assert reported == context_dict["namespace"]
Exemple #28
0
 def func():
     """Return the actor call stats from the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx._get_actor_call_stats()
Exemple #29
0
    def deploy(
            self,
            name: str,
            deployment_def: Union[Callable, Type[Callable], str],
            init_args: Tuple[Any],
            init_kwargs: Dict[Any, Any],
            ray_actor_options: Optional[Dict] = None,
            config: Optional[Union[DeploymentConfig, Dict[str, Any]]] = None,
            version: Optional[str] = None,
            prev_version: Optional[str] = None,
            route_prefix: Optional[str] = None,
            url: Optional[str] = None,
            _blocking: Optional[bool] = True) -> Optional[GoalId]:
        """Send a deployment to the controller and optionally wait for it.

        Args:
            name: Deployment name, forwarded to the controller.
            deployment_def: Passed to ``ReplicaConfig`` as the deployment
                definition.
            init_args: Positional init arguments for ``ReplicaConfig``.
            init_kwargs: Keyword init arguments for ``ReplicaConfig``.
            ray_actor_options: Actor options for ``ReplicaConfig``. Its
                "runtime_env" entry is filled in from the current job's
                runtime env (see below); the dict is modified in place.
            config: A ``DeploymentConfig`` or a dict parsed with
                ``DeploymentConfig.parse_obj``. Defaults to an empty dict.
            version: Forwarded to the controller; also used in log messages.
            prev_version: Forwarded to the controller.
            route_prefix: Forwarded to the controller.
            url: Only used in the "is ready" log message.
            _blocking: When truthy, wait for the goal to complete.

        Returns:
            ``None`` when ``_blocking`` is truthy, otherwise the goal id
            returned by the controller.

        Raises:
            TypeError: If ``config`` is neither a dict nor a
                ``DeploymentConfig``.
        """
        if config is None:
            config = {}
        if ray_actor_options is None:
            ray_actor_options = {}

        curr_job_env = ray.get_runtime_context().runtime_env
        if "runtime_env" in ray_actor_options:
            # Inherit the job's working_dir only when the job actually has
            # one. Calling setdefault unconditionally would write an explicit
            # ``working_dir=None`` into the user-supplied runtime_env.
            job_working_dir = curr_job_env.get("working_dir")
            if job_working_dir is not None:
                ray_actor_options["runtime_env"].setdefault(
                    "working_dir", job_working_dir)
        else:
            ray_actor_options["runtime_env"] = curr_job_env

        replica_config = ReplicaConfig(
            deployment_def,
            init_args=init_args,
            init_kwargs=init_kwargs,
            ray_actor_options=ray_actor_options)

        if isinstance(config, dict):
            deployment_config = DeploymentConfig.parse_obj(config)
        elif isinstance(config, DeploymentConfig):
            deployment_config = config
        else:
            raise TypeError(
                "config must be a DeploymentConfig or a dictionary.")

        if deployment_config.autoscaling_config is not None and \
            deployment_config.max_concurrent_queries < deployment_config. \
                autoscaling_config.target_num_ongoing_requests_per_replica:
            logger.warning("Autoscaling will never happen, "
                           "because 'max_concurrent_queries' is less than "
                           "'target_num_ongoing_requests_per_replica' now.")

        goal_id, updating = ray.get(
            self._controller.deploy.remote(name,
                                           deployment_config.to_proto_bytes(),
                                           replica_config, version,
                                           prev_version, route_prefix,
                                           ray.get_runtime_context().job_id))

        tag = f"component=serve deployment={name}"

        if updating:
            msg = f"Updating deployment '{name}'"
            if version is not None:
                msg += f" to version '{version}'"
            logger.info(f"{msg}. {tag}")
        else:
            logger.info(f"Deployment '{name}' is already at version "
                        f"'{version}', not updating. {tag}")

        if _blocking:
            self._wait_for_goal(goal_id)

            if url is not None:
                url_part = f" at `{url}`"
            else:
                url_part = ""
            logger.info(
                f"Deployment '{name}{':'+version if version else ''}' is ready"
                f"{url_part}. {tag}")
        else:
            return goal_id
Exemple #30
0
 def run(self):
     """Return the actor call stats from the runtime context."""
     runtime_ctx = ray.get_runtime_context()
     return runtime_ctx._get_actor_call_stats()