Beispiel #1
0
    async def create(
        self,
        uri: str,
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: Optional[logging.Logger] = default_logger,
    ) -> int:
        if not runtime_env.has_pip():
            return 0

        protocol, hash = parse_uri(uri)
        target_dir = self._get_path_from_hash(hash)

        async def _create_for_hash():
            await PipProcessor(
                target_dir,
                runtime_env,
                logger,
            )

            loop = get_running_loop()
            return await loop.run_in_executor(None, get_directory_size_bytes,
                                              target_dir)

        if uri not in self._create_locks:
            # async lock to prevent the same virtualenv being concurrently installed
            self._create_locks[uri] = asyncio.Lock()

        async with self._create_locks[uri]:
            self._creating_task[hash] = task = create_task(_create_for_hash())
            task.add_done_callback(
                lambda _: self._creating_task.pop(hash, None))
            return await task
Beispiel #2
0
    async def create(
        self,
        uri: str,
        runtime_env: "RuntimeEnv",  # noqa: F821
        context: RuntimeEnvContext,
        logger: Optional[logging.Logger] = default_logger,
    ) -> int:
        if not runtime_env.has_pip():
            return 0

        protocol, hash = parse_uri(uri)
        target_dir = self._get_path_from_hash(hash)

        async def _create_for_hash():
            await PipProcessor(target_dir, runtime_env, logger)

            loop = get_running_loop()
            return await loop.run_in_executor(None, get_directory_size_bytes,
                                              target_dir)

        self._creating_task[hash] = task = create_task(_create_for_hash())
        task.add_done_callback(lambda _: self._creating_task.pop(hash, None))
        return await task
Beispiel #3
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(
            serialized_runtime_env, serialized_allocated_resource_instances
        ):
            runtime_env = RuntimeEnv.deserialize(serialized_runtime_env)
            allocated_resource: dict = json.loads(
                serialized_allocated_resource_instances or "{}"
            )

            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            # TODO(chenk008): Add log about allocated_resource to
            # avoid lint error. That will be moved to cgroup plugin.
            per_job_logger.debug(f"Worker has resource :" f"{allocated_resource}")
            context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
            await self._container_manager.setup(
                runtime_env, context, logger=per_job_logger
            )

            for (manager, uri_cache) in [
                (self._working_dir_manager, self._working_dir_uri_cache),
                (self._conda_manager, self._conda_uri_cache),
                (self._pip_manager, self._pip_uri_cache),
            ]:
                uri = manager.get_uri(runtime_env)
                if uri is not None:
                    if uri not in uri_cache:
                        per_job_logger.debug(f"Cache miss for URI {uri}.")
                        size_bytes = await manager.create(
                            uri, runtime_env, context, logger=per_job_logger
                        )
                        uri_cache.add(uri, size_bytes, logger=per_job_logger)
                    else:
                        per_job_logger.debug(f"Cache hit for URI {uri}.")
                        uri_cache.mark_used(uri, logger=per_job_logger)
                manager.modify_context(uri, runtime_env, context)

            # Set up py_modules. For now, py_modules uses multiple URIs so
            # the logic is slightly different from working_dir, conda, and
            # pip above.
            py_modules_uris = self._py_modules_manager.get_uris(runtime_env)
            if py_modules_uris is not None:
                for uri in py_modules_uris:
                    if uri not in self._py_modules_uri_cache:
                        per_job_logger.debug(f"Cache miss for URI {uri}.")
                        size_bytes = await self._py_modules_manager.create(
                            uri, runtime_env, context, logger=per_job_logger
                        )
                        self._py_modules_uri_cache.add(
                            uri, size_bytes, logger=per_job_logger
                        )
                    else:
                        per_job_logger.debug(f"Cache hit for URI {uri}.")
                        self._py_modules_uri_cache.mark_used(uri, logger=per_job_logger)
            self._py_modules_manager.modify_context(
                py_modules_uris, runtime_env, context
            )

            # Add the mapping of URIs -> the serialized environment to be
            # used for cache invalidation.
            if runtime_env.working_dir_uri():
                uri = runtime_env.working_dir_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.py_modules_uris():
                for uri in runtime_env.py_modules_uris():
                    self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.conda_uri():
                uri = runtime_env.conda_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.pip_uri():
                uri = runtime_env.pip_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.plugin_uris():
                for uri in runtime_env.plugin_uris():
                    self._uris_to_envs[uri].add(serialized_runtime_env)

            def setup_plugins():
                # Run setup function from all the plugins
                for plugin_class_path, config in runtime_env.plugins():
                    per_job_logger.debug(
                        f"Setting up runtime env plugin {plugin_class_path}"
                    )
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create(
                        "uri not implemented", json.loads(config), context
                    )
                    plugin_class.modify_context(
                        "uri not implemented", json.loads(config), context
                    )

            loop = asyncio.get_event_loop()
            # Plugins setup method is sync process, running in other threads
            # is to avoid  blocks asyncio loop
            await loop.run_in_executor(None, setup_plugins)

            return context

        serialized_env = request.serialized_runtime_env
        runtime_env_config = request.runtime_env_config

        if serialized_env not in self._env_locks:
            # async lock to prevent the same env being concurrently installed
            self._env_locks[serialized_env] = asyncio.Lock()

        async with self._env_locks[serialized_env]:
            if serialized_env in self._env_cache:
                serialized_context = self._env_cache[serialized_env]
                result = self._env_cache[serialized_env]
                if result.success:
                    context = result.result
                    self._logger.info(
                        "Runtime env already created "
                        f"successfully. Env: {serialized_env}, "
                        f"context: {context}"
                    )
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                        serialized_runtime_env_context=context,
                    )
                else:
                    error_message = result.result
                    self._logger.info(
                        "Runtime env already failed. "
                        f"Env: {serialized_env}, "
                        f"err: {error_message}"
                    )
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                        error_message=error_message,
                    )

            if SLEEP_FOR_TESTING_S:
                self._logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
                time.sleep(int(SLEEP_FOR_TESTING_S))

            self._logger.info(f"Creating runtime env: {serialized_env}.")
            runtime_env_context: RuntimeEnvContext = None
            error_message = None
            runtime_env_config = RuntimeEnvConfig.from_proto(runtime_env_config)
            # accroding to the document of `asyncio.wait_for`,
            # None means disable timeout logic
            setup_timeout_seconds = (
                None
                if runtime_env_config["setup_timeout_seconds"] == -1
                else runtime_env_config["setup_timeout_seconds"]
            )
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    # python 3.6 requires the type of input is `Future`,
                    # python 3.7+ only requires the type of input is `Awaitable`
                    # TODO(Catch-Bull): remove create_task when ray drop python 3.6
                    runtime_env_setup_task = create_task(
                        _setup_runtime_env(
                            serialized_env,
                            request.serialized_allocated_resource_instances,
                        )
                    )
                    runtime_env_context = await asyncio.wait_for(
                        runtime_env_setup_task, timeout=setup_timeout_seconds
                    )
                    error_message = None
                    break
                except Exception as e:
                    err_msg = f"Failed to create runtime env {serialized_env}."
                    self._logger.exception(err_msg)
                    error_message = "".join(
                        traceback.format_exception(type(e), e, e.__traceback__)
                    )
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                self._logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES,
                )
                self._env_cache[serialized_env] = CreatedEnvResult(False, error_message)
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message,
                )

            serialized_context = runtime_env_context.serialize()
            self._env_cache[serialized_env] = CreatedEnvResult(True, serialized_context)
            self._logger.info(
                "Successfully created runtime env: %s, the context: %s",
                serialized_env,
                serialized_context,
            )
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context,
            )
Beispiel #4
0
        async def _create_runtime_env_with_retry(
            runtime_env,
            serialized_runtime_env,
            serialized_allocated_resource_instances,
            setup_timeout_seconds,
        ) -> Tuple[bool, str, str]:
            """
            Create runtime env with retry times. This function won't raise exceptions.

            Args:
                runtime_env(RuntimeEnv): The instance of RuntimeEnv class.
                serialized_runtime_env(str): The serialized runtime env.
                serialized_allocated_resource_instances(str): The serialized allocated
                resource instances.
                setup_timeout_seconds(int): The timeout of runtime environment creation.

            Returns:
                a tuple which contains result(bool), runtime env context(str), error
                message(str).

            """
            self._logger.info(
                f"Creating runtime env: {serialized_env} with timeout "
                f"{setup_timeout_seconds} seconds.")
            serialized_context = None
            error_message = None
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    # python 3.6 requires the type of input is `Future`,
                    # python 3.7+ only requires the type of input is `Awaitable`
                    # TODO(Catch-Bull): remove create_task when ray drop python 3.6
                    runtime_env_setup_task = create_task(
                        _setup_runtime_env(
                            runtime_env,
                            serialized_env,
                            request.serialized_allocated_resource_instances,
                        ))
                    runtime_env_context = await asyncio.wait_for(
                        runtime_env_setup_task, timeout=setup_timeout_seconds)
                    serialized_context = runtime_env_context.serialize()
                    error_message = None
                    break
                except Exception as e:
                    err_msg = f"Failed to create runtime env {serialized_env}."
                    self._logger.exception(err_msg)
                    error_message = "".join(
                        traceback.format_exception(type(e), e,
                                                   e.__traceback__))
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                self._logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES,
                )
                return False, None, error_message
            else:
                self._logger.info(
                    "Successfully created runtime env: %s, the context: %s",
                    serialized_env,
                    serialized_context,
                )
                return True, serialized_context, None