Example #1
    def start_specific_server(self, client_id: str,
                              job_config: JobConfig) -> bool:
        """
        Start up a RayClient Server for an incoming client to
        communicate with. Returns whether creation was successful.
        """
        specific_server = self._get_server_for_client(client_id)
        assert specific_server, f"Server has not been created for: {client_id}"

        output, error = self.node.get_log_file_handles(
            f"ray_client_server_{specific_server.port}", unique=True)

        serialized_runtime_env = job_config.get_serialized_runtime_env()
        runtime_env_config = job_config.get_proto_runtime_env_config()
        if not serialized_runtime_env or serialized_runtime_env == "{}":
            # TODO(edoakes): can we just remove this case and always send it
            # to the agent?
            serialized_runtime_env_context = RuntimeEnvContext().serialize()
        else:
            serialized_runtime_env_context = self._create_runtime_env(
                serialized_runtime_env=serialized_runtime_env,
                runtime_env_config=runtime_env_config,
                specific_server=specific_server,
            )

        proc = start_ray_client_server(
            self.address,
            self.node.node_ip_address,
            specific_server.port,
            stdout_file=output,
            stderr_file=error,
            fate_share=self.fate_share,
            server_type="specific-server",
            serialized_runtime_env_context=serialized_runtime_env_context,
            redis_password=self._redis_password,
        )

        # Wait for the running process to transition from the shim process
        # to the actual RayClient Server.
        pid = proc.process.pid
        if sys.platform != "win32":
            psutil_proc = psutil.Process(pid)
        else:
            # Don't use `psutil` on Win32.
            psutil_proc = None
        while psutil_proc is not None:
            if proc.process.poll() is not None:
                logger.error(
                    f"SpecificServer startup failed for client: {client_id}")
                break
            cmd = psutil_proc.cmdline()
            if _match_running_client_server(cmd):
                break
            logger.debug(
                "Waiting for process to transition to the actual client server.")
            time.sleep(0.5)
        specific_server.set_result(proc)
        logger.info(f"SpecificServer started on port: {specific_server.port} "
                    f"with PID: {pid} for client: {client_id}")
        return proc.process.poll() is None
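
The polling loop above inspects the child's command line via psutil until the shim process has been replaced by the real client server. A self-contained sketch of that pattern, where `expected_token` is a hypothetical stand-in for the matching done by `_match_running_client_server`:

import subprocess
import time

import psutil

def wait_for_cmdline(proc: subprocess.Popen, expected_token: str,
                     timeout: float = 30.0) -> bool:
    """Return True once the child's command line contains expected_token."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            return False  # The child exited before reaching the target command.
        try:
            cmd = psutil.Process(proc.pid).cmdline()
        except psutil.NoSuchProcess:
            return False  # The process disappeared between poll() and cmdline().
        if any(expected_token in part for part in cmd):
            return True
        time.sleep(0.5)
    return False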
Example #2
            def run_setup_with_logger():
                runtime_env: dict = json.loads(serialized_runtime_env or "{}")
                allocated_resource: dict = json.loads(
                    serialized_allocated_resource_instances or "{}")

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                # TODO(chenk008): Add log about allocated_resource to
                # avoid lint error. That will be moved to cgroup plugin.
                per_job_logger.debug(f"Worker has resource :"
                                     f"{allocated_resource}")
                context = RuntimeEnvContext(
                    env_vars=runtime_env.get("env_vars"))
                self._conda_manager.setup(
                    runtime_env, context, logger=per_job_logger)
                self._working_dir_manager.setup(
                    runtime_env, context, logger=per_job_logger)

                # Add the mapping of URIs -> the serialized environment to be
                # used for cache invalidation.
                for uri in runtime_env.get("uris") or []:
                    self._working_dir_uri_to_envs[uri].add(
                        serialized_runtime_env)

                # Run setup function from all the plugins
                for plugin_class_path in runtime_env.get("plugins", {}).keys():
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create("uri not implemented", runtime_env,
                                        context)
                    plugin_class.modify_context("uri not implemented",
                                                runtime_env, context)

                return context
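
The `_working_dir_uri_to_envs` mapping above acts as a reverse index from a URI to every serialized runtime env that references it, so that removing a URI can invalidate all dependent environments. A minimal standalone sketch of that bookkeeping (names are illustrative, not Ray's API):

from collections import defaultdict
from typing import Dict, List, Set

uri_to_envs: Dict[str, Set[str]] = defaultdict(set)

def register_env(serialized_env: str, uris: List[str]) -> None:
    # Record that this environment depends on each of its URIs.
    for uri in uris:
        uri_to_envs[uri].add(serialized_env)

def envs_to_invalidate(uri: str) -> Set[str]:
    # When a URI is deleted, every env that referenced it becomes stale.
    return uri_to_envs.pop(uri, set())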
Example #3
def test_inherit_cluster_env_pythonpath(monkeypatch):
    monkeypatch.setenv("PYTHONPATH",
                       "last" + os.pathsep + os.environ.get("PYTHONPATH", ""))
    context = RuntimeEnvContext(env_vars={"PYTHONPATH": "middle"})

    set_pythonpath_in_context("first", context)

    assert context.env_vars["PYTHONPATH"].startswith(
        os.pathsep.join(["first", "middle", "last"]))
Example #4
            def run_setup_with_logger():
                runtime_env = RuntimeEnv(
                    serialized_runtime_env=serialized_runtime_env)
                allocated_resource: dict = json.loads(
                    serialized_allocated_resource_instances or "{}")

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                # TODO(chenk008): Add log about allocated_resource to
                # avoid lint error. That will be moved to cgroup plugin.
                per_job_logger.debug(f"Worker has resource :"
                                     f"{allocated_resource}")
                context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
                self._pip_manager.setup(
                    runtime_env, context, logger=per_job_logger)
                self._conda_manager.setup(
                    runtime_env, context, logger=per_job_logger)
                self._py_modules_manager.setup(
                    runtime_env, context, logger=per_job_logger)
                self._working_dir_manager.setup(
                    runtime_env, context, logger=per_job_logger)
                self._container_manager.setup(
                    runtime_env, context, logger=per_job_logger)

                # Add the mapping of URIs -> the serialized environment to be
                # used for cache invalidation.
                if runtime_env.working_dir_uri():
                    uri = runtime_env.working_dir_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.py_modules_uris():
                    for uri in runtime_env.py_modules_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.conda_uri():
                    uri = runtime_env.conda_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.pip_uri():
                    uri = runtime_env.pip_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.plugin_uris():
                    for uri in runtime_env.plugin_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)

                # Run setup function from all the plugins
                for plugin_class_path, config in runtime_env.plugins():
                    logger.debug(
                        f"Setting up runtime env plugin {plugin_class_path}")
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create("uri not implemented",
                                        json.loads(config), context)
                    plugin_class.modify_context("uri not implemented",
                                                json.loads(config), context)

                return context
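
The five `if` blocks above repeat the same URI-to-env bookkeeping per resource type. A compact equivalent could gather every URI first, assuming the same RuntimeEnv accessor methods:

def collect_uris(runtime_env) -> list:
    # Single-URI resources: working_dir, conda, pip.
    uris = []
    for getter in (runtime_env.working_dir_uri, runtime_env.conda_uri,
                   runtime_env.pip_uri):
        uri = getter()
        if uri:
            uris.append(uri)
    # Multi-URI resources: py_modules and plugins.
    for getter in (runtime_env.py_modules_uris, runtime_env.plugin_uris):
        uris.extend(getter() or [])
    return uris

# Usage: for uri in collect_uris(runtime_env):
#            self._uris_to_envs[uri].add(serialized_runtime_env)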
Example #5
        async def _setup_runtime_env(runtime_env, serialized_runtime_env,
                                     serialized_allocated_resource_instances):
            allocated_resource: dict = json.loads(
                serialized_allocated_resource_instances or "{}")
            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            # TODO(chenk008): Add log about allocated_resource to
            # avoid lint error. That will be moved to cgroup plugin.
            per_job_logger.debug(f"Worker has resource :"
                                 f"{allocated_resource}")
            context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
            await self._container_manager.setup(runtime_env,
                                                context,
                                                logger=per_job_logger)

            for manager in self._base_plugin_cache_managers.values():
                await manager.create_if_needed(runtime_env,
                                               context,
                                               logger=per_job_logger)

            def setup_plugins():
                # Run setup function from all the plugins
                if runtime_env.plugins():
                    for (
                            setup_context
                    ) in self._runtime_env_plugin_manager.sorted_plugin_setup_contexts(
                            runtime_env.plugins()):
                        per_job_logger.debug(
                            f"Setting up runtime env plugin {setup_context.name}"
                        )
                        # TODO(architkulkarni): implement uri support
                        setup_context.class_instance.validate(runtime_env)
                        setup_context.class_instance.create(
                            "uri not implemented", setup_context.config,
                            context)
                        setup_context.class_instance.modify_context(
                            "uri not implemented",
                            setup_context.config,
                            context,
                            per_job_logger,
                        )

            loop = asyncio.get_event_loop()
            # The plugins' setup methods are synchronous; run them in a
            # separate thread to avoid blocking the asyncio event loop.
            await loop.run_in_executor(None, setup_plugins)

            return context
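
This example (and Example #8 below) offloads the synchronous plugin setup to a worker thread via `run_in_executor` so the event loop stays responsive. A self-contained sketch of that pattern:

import asyncio
import time

def blocking_setup() -> str:
    # Stand-in for a synchronous plugin setup method.
    time.sleep(1)
    return "plugins ready"

async def main() -> None:
    loop = asyncio.get_running_loop()
    # Run the blocking call on the default thread pool; other coroutines
    # can continue to make progress while setup is in flight.
    result = await loop.run_in_executor(None, blocking_setup)
    print(result)

asyncio.run(main())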
Example #6
async def test_create_delete_size_equal(tmpdir, ray_start_regular):
    """Tests that `create` and `delete_uri` return the same size for a URI."""

    # Create an arbitrary nonempty directory to upload.
    path = Path(tmpdir)
    dir_to_upload = path / "dir_to_upload"
    dir_to_upload.mkdir(parents=True)
    filepath = dir_to_upload / "file"
    with filepath.open("w") as file:
        file.write("F" * 100)

    uri = get_uri_for_directory(dir_to_upload)
    assert get_directory_size_bytes(dir_to_upload) > 0

    uploaded = upload_package_if_needed(uri, tmpdir, dir_to_upload)
    assert uploaded

    manager = WorkingDirManager(tmpdir)

    created_size_bytes = await manager.create(uri, {}, RuntimeEnvContext())
    deleted_size_bytes = manager.delete_uri(uri)
    assert created_size_bytes == deleted_size_bytes
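
The test relies on `get_directory_size_bytes` to weigh the directory before upload. A minimal sketch of how such a helper can be computed (an illustration, not Ray's actual implementation):

from pathlib import Path

def directory_size_bytes(path: Path) -> int:
    # Sum the sizes of all regular files in the directory tree.
    return sum(p.stat().st_size for p in path.rglob("*") if p.is_file())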
Example #7
            def run_setup_with_logger():
                runtime_env = RuntimeEnv(serialized_runtime_env=serialized_runtime_env)
                allocated_resource: dict = json.loads(
                    serialized_allocated_resource_instances or "{}"
                )

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                # TODO(chenk008): Add log about allocated_resource to
                # avoid lint error. That will be moved to cgroup plugin.
                per_job_logger.debug(f"Worker has resource :" f"{allocated_resource}")
                context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
                self._container_manager.setup(
                    runtime_env, context, logger=per_job_logger
                )

                for (manager, uri_cache) in [
                    (self._working_dir_manager, self._working_dir_uri_cache),
                    (self._conda_manager, self._conda_uri_cache),
                    (self._pip_manager, self._pip_uri_cache),
                ]:
                    uri = manager.get_uri(runtime_env)
                    if uri is not None:
                        if uri not in uri_cache:
                            per_job_logger.debug(f"Cache miss for URI {uri}.")
                            size_bytes = manager.create(
                                uri, runtime_env, context, logger=per_job_logger
                            )
                            uri_cache.add(uri, size_bytes, logger=per_job_logger)
                        else:
                            per_job_logger.debug(f"Cache hit for URI {uri}.")
                            uri_cache.mark_used(uri, logger=per_job_logger)
                    manager.modify_context(uri, runtime_env, context)

                # Set up py_modules. For now, py_modules uses multiple URIs so
                # the logic is slightly different from working_dir, conda, and
                # pip above.
                py_modules_uris = self._py_modules_manager.get_uris(runtime_env)
                if py_modules_uris is not None:
                    for uri in py_modules_uris:
                        if uri not in self._py_modules_uri_cache:
                            per_job_logger.debug(f"Cache miss for URI {uri}.")
                            size_bytes = self._py_modules_manager.create(
                                uri, runtime_env, context, logger=per_job_logger
                            )
                            self._py_modules_uri_cache.add(
                                uri, size_bytes, logger=per_job_logger
                            )
                        else:
                            per_job_logger.debug(f"Cache hit for URI {uri}.")
                            self._py_modules_uri_cache.mark_used(
                                uri, logger=per_job_logger
                            )
                self._py_modules_manager.modify_context(
                    py_modules_uris, runtime_env, context
                )

                # Add the mapping of URIs -> the serialized environment to be
                # used for cache invalidation.
                if runtime_env.working_dir_uri():
                    uri = runtime_env.working_dir_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.py_modules_uris():
                    for uri in runtime_env.py_modules_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.conda_uri():
                    uri = runtime_env.conda_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.pip_uri():
                    uri = runtime_env.pip_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.plugin_uris():
                    for uri in runtime_env.plugin_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)

                # Run setup function from all the plugins
                for plugin_class_path, config in runtime_env.plugins():
                    per_job_logger.debug(
                        f"Setting up runtime env plugin {plugin_class_path}"
                    )
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create(
                        "uri not implemented", json.loads(config), context
                    )
                    plugin_class.modify_context(
                        "uri not implemented", json.loads(config), context
                    )

                return context
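
The cache-hit/cache-miss logic above assumes a URI cache that tracks entry sizes and usage. A hypothetical minimal class matching that usage (the real Ray cache additionally drives eviction):

class UriCacheSketch:
    """Hypothetical minimal cache matching the `uri_cache` calls above."""

    def __init__(self) -> None:
        self._sizes = {}          # URI -> size in bytes
        self.total_size_bytes = 0

    def __contains__(self, uri: str) -> bool:
        return uri in self._sizes

    def add(self, uri: str, size_bytes: int, logger=None) -> None:
        # Register a newly created URI and account for its size.
        self._sizes[uri] = size_bytes
        self.total_size_bytes += size_bytes

    def mark_used(self, uri: str, logger=None) -> None:
        # Hook for recency tracking; a real cache would feed this into an
        # eviction policy once total_size_bytes exceeds its limit.
        assert uri in self._sizes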
Example #8
        async def _setup_runtime_env(runtime_env, serialized_runtime_env,
                                     serialized_allocated_resource_instances):
            allocated_resource: dict = json.loads(
                serialized_allocated_resource_instances or "{}")
            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            # TODO(chenk008): Add log about allocated_resource to
            # avoid lint error. That will be moved to cgroup plugin.
            per_job_logger.debug(f"Worker has resource :"
                                 f"{allocated_resource}")
            context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
            await self._container_manager.setup(runtime_env,
                                                context,
                                                logger=per_job_logger)

            for (manager, uri_cache) in [
                (self._working_dir_manager, self._working_dir_uri_cache),
                (self._conda_manager, self._conda_uri_cache),
                (self._pip_manager, self._pip_uri_cache),
            ]:
                uri = manager.get_uri(runtime_env)
                if uri is not None:
                    if uri not in uri_cache:
                        per_job_logger.debug(f"Cache miss for URI {uri}.")
                        size_bytes = await manager.create(
                            uri, runtime_env, context, logger=per_job_logger)
                        uri_cache.add(uri, size_bytes, logger=per_job_logger)
                    else:
                        per_job_logger.debug(f"Cache hit for URI {uri}.")
                        uri_cache.mark_used(uri, logger=per_job_logger)
                manager.modify_context(uri, runtime_env, context)

            # Set up py_modules. For now, py_modules uses multiple URIs so
            # the logic is slightly different from working_dir, conda, and
            # pip above.
            for (manager, uri_cache) in [
                (self._java_jars_manager, self._java_jars_uri_cache),
                (self._py_modules_manager, self._py_modules_uri_cache),
            ]:
                uris = manager.get_uris(runtime_env)
                if uris is not None:
                    per_job_logger.debug(f"URIs is not None, URI {uris}.")
                    for uri in uris:
                        if uri not in uri_cache:
                            per_job_logger.debug(f"Cache miss for URI {uri}.")
                            size_bytes = await manager.create(
                                uri,
                                runtime_env,
                                context,
                                logger=per_job_logger)
                            uri_cache.add(uri,
                                          size_bytes,
                                          logger=per_job_logger)
                        else:
                            per_job_logger.debug(f"Cache hit for URI {uri}.")
                            uri_cache.mark_used(uri, logger=per_job_logger)
                manager.modify_context(uris, runtime_env, context)

            def setup_plugins():
                # Run setup function from all the plugins
                for plugin_class_path, config in runtime_env.plugins():
                    per_job_logger.debug(
                        f"Setting up runtime env plugin {plugin_class_path}")
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create("uri not implemented",
                                        json.loads(config), context)
                    plugin_class.modify_context("uri not implemented",
                                                json.loads(config), context)

            loop = asyncio.get_event_loop()
            # The plugins' setup methods are synchronous; run them in a
            # separate thread to avoid blocking the asyncio event loop.
            await loop.run_in_executor(None, setup_plugins)

            return context
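
The plugin loop above resolves each plugin class from its dotted path with `import_attr`. A standalone sketch of that dynamic-import pattern:

import importlib

def import_attr_sketch(full_path: str):
    # Split "package.module.ClassName" into module path and attribute name,
    # import the module, and return the attribute.
    module_name, attr_name = full_path.rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)

# Usage: resolve any attribute from its dotted path.
json_dumps = import_attr_sketch("json.dumps")
assert json_dumps({"a": 1}) == '{"a": 1}'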