Example #1
    def __init__(self, context, backend_addr, friendly_name, concurrency, tasks_fs: FileSystemProvider, address_host=None, external_ports=None, tmp_dir="./agent_tmp"):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify the agent
        :param concurrency: number of simultaneous jobs that can be run by this agent
        :param tasks_fs: FileSystemProvider for the course / tasks
        :param address_host: hostname/IP/... to which external clients should connect to access the Docker containers
        :param external_ports: iterable containing ports to which the docker instance can bind internal ports
        :param tmp_dir: temp dir that is used by the agent to start new containers
        """
        super(DockerAgent, self).__init__(context, backend_addr, friendly_name, concurrency, tasks_fs)
        self._logger = logging.getLogger("inginious.agent.docker")

        self._max_memory_per_slot = int(psutil.virtual_memory().total / concurrency / 1024 / 1024)

        self.tasks_fs = tasks_fs

        # Temp dir
        self._tmp_dir = tmp_dir

        # SSH remote debug
        self._address_host = address_host
        self._external_ports = set(external_ports) if external_ports is not None else set()

        # Async proxy to os
        self._aos = AsyncProxy(os)
        self._ashutil = AsyncProxy(shutil)
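
A minimal usage sketch for this constructor. The ZeroMQ context is standard pyzmq API; the FileSystemProvider import path, filesystem path, and port range are illustrative assumptions, not taken from the example:

import zmq.asyncio
from inginious.common.filesystems.local import LocalFSProvider  # module path assumed

context = zmq.asyncio.Context()
tasks_fs = LocalFSProvider("/path/to/tasks")  # any FileSystemProvider implementation works
agent = DockerAgent(context, "tcp://127.0.0.1:2222", "docker-agent-1",
                    concurrency=4, tasks_fs=tasks_fs,
                    external_ports=range(64100, 64120))
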
Example #2
    def __init__(self, context, backend_addr, friendly_name, concurrency, tasks_fs: FileSystemProvider, address_host=None, external_ports=None, tmp_dir="./agent_tmp", runtimes=None, ssh_allowed=False):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify the agent
        :param concurrency: number of simultaneous jobs that can be run by this agent
        :param tasks_fs: FileSystemProvider for the course / tasks
        :param address_host: hostname/IP/... to which external clients should connect to access the Docker containers
        :param external_ports: iterable containing ports to which the docker instance can bind internal ports
        :param tmp_dir: temp dir that is used by the agent to start new containers
        :param runtimes: iterable of DockerRuntime objects describing the runtimes available to this agent (auto-detected when None; the defaults are "runc" for docker and "kata-runtime" for kata)
        :param ssh_allowed: whether this agent accepts tasks that request SSH debugging
        """
        super(DockerAgent, self).__init__(context, backend_addr, friendly_name, concurrency, tasks_fs, ssh_allowed=ssh_allowed)

        self._runtimes = {x.envtype: x for x in runtimes} if runtimes is not None else None

        self._logger = logging.getLogger("inginious.agent.docker")

        self._max_memory_per_slot = int(psutil.virtual_memory().total / concurrency / 1024 / 1024)

        # Temp dir
        self._tmp_dir = tmp_dir

        # SSH remote debug
        self._address_host = address_host
        self._external_ports = set(external_ports) if external_ports is not None else set()

        # Async proxy to os
        self._aos = AsyncProxy(os)
        self._ashutil = AsyncProxy(shutil)
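
Since this variant accepts an explicit runtimes list, here is a hedged sketch of what that argument could look like, reusing the DockerRuntime fields shown in _detect_runtimes() in Example #6; the concrete values are placeholders:

# Placeholder runtimes mirroring the heuristics in _detect_runtimes():
# runc shares the host kernel, kata-runtime gives each job an isolated kernel.
runtimes = [
    DockerRuntime(runtime="runc", run_as_root=False, shared_kernel=True, envtype="docker"),
    DockerRuntime(runtime="kata-runtime", run_as_root=True, shared_kernel=False, envtype="kata"),
]
agent = DockerAgent(context, "tcp://127.0.0.1:2222", "docker-agent-1",
                    concurrency=4, tasks_fs=tasks_fs,
                    runtimes=runtimes, ssh_allowed=True)
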
Example #3
    async def _init_clean(self):
        """ Must be called when the agent is starting """
        # Data about running containers
        self._containers_running: Dict[str, DockerRunningJob] = {}  # container_id -> info
        self._container_for_job: Dict[str, str] = {}  # job id -> container_id

        self._student_containers_running: Dict[
            str, DockerRunningStudentContainer] = {}

        self._containers_killed = dict()

        # Delete tmp_dir and recreate it
        try:
            await self._ashutil.rmtree(self._tmp_dir)
        except OSError:
            pass

        try:
            await self._aos.mkdir(self._tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = AsyncProxy(DockerInterface())

        if self._runtimes is None:
            self._runtimes = self._detect_runtimes()

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = await self._docker.get_containers(
            self._runtimes.values())

        if self._address_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            available_bare_container_images = [
                image for envtype_containers in self._containers.values()
                for image in envtype_containers.values()
            ]
            if len(available_bare_container_images) != 0:
                self._address_host = await self._docker.get_host_ip(
                    available_bare_container_images[0]["id"])
            else:
                self._logger.error(
                    "Cannot find the external IP without at least an installed container."
                )

        if self._address_host is None:
            self._logger.warning(
                "Cannot find external host IP. Please indicate it in the configuration. "
                "Remote SSH debug has been deactivated.")
            self._external_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s",
                              self._address_host)

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)
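
_init_clean above awaits blocking calls such as shutil.rmtree through AsyncProxy. A minimal sketch of how such a proxy could work, consistent with the .sync attribute used elsewhere in these examples (an assumption about its internals; the real INGInious class may differ):

import asyncio
import functools

class AsyncProxy:
    """Wrap a module or object so attribute calls become awaitables run in an executor."""
    def __init__(self, obj):
        self.sync = obj  # direct synchronous access, as in self._docker.sync.*

    def __getattr__(self, name):
        func = getattr(self.sync, name)
        async def _wrapper(*args, **kwargs):
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))
        return _wrapper
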
Example #4
    async def _init_clean(self):
        """ Must be called when the agent is starting """
        # Data about running containers
        self._containers_running = {}
        self._container_for_job = {}

        self._student_containers_running = {}
        self._student_containers_for_job = {}

        self._containers_killed = dict()

        # Delete tmp_dir and recreate it
        try:
            await self._ashutil.rmtree(self._tmp_dir)
        except OSError:
            pass

        try:
            await self._aos.mkdir(self._tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = AsyncProxy(DockerInterface())

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = await self._docker.get_containers()
        for idx in self._containers:
            self._containers[idx][
                "type"] = "docker"  # type is not given by self._docker.get_containers()

        self._assigned_external_ports = {}  # container_id : [external_ports]

        if self._address_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            self._address_host = await self._docker.get_host_ip(
                next(iter(self._containers.values()))["id"])
        if self._address_host is None:
            self._logger.warning(
                "Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated."
            )
            self._external_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s",
                              self._address_host)

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)
Example #5
    async def _init_clean(self):
        """ Must be called when the agent is starting """
        # Data about running containers
        self._containers_running = {}
        self._container_for_job = {}

        self._student_containers_running = {}
        self._student_containers_for_job = {}

        self._containers_killed = dict()

        # Delete tmp_dir and recreate it
        try:
            await self._ashutil.rmtree(self._tmp_dir)
        except OSError:
            pass

        try:
            await self._aos.mkdir(self._tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = AsyncProxy(DockerInterface())

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = await self._docker.get_containers()

        self._assigned_external_ports = {}  # container_id : [external_ports]

        if self._address_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            self._address_host = await self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
        if self._address_host is None:
            self._logger.warning(
                "Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
            self._external_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s", self._address_host)

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)
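
A pattern worth noting before the full class in Example #6: external ports are popped from a shared pool and must be returned on any failure, otherwise they leak. A standalone sketch of that invariant (names and values are illustrative):

external_ports = {64100, 64101, 64102}  # illustrative pool

def reserve_ports(ports_needed):
    """Pop one external port per requested internal port; roll back on failure."""
    ports = {}
    try:
        for p in ports_needed:
            if not external_ports:
                raise RuntimeError("No ports are available right now.")
            ports[p] = external_ports.pop()
        return ports
    except Exception:
        external_ports.update(ports.values())  # return reserved ports to the pool
        raise
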
Example #6
class DockerAgent(Agent):
    def __init__(self,
                 context,
                 backend_addr,
                 friendly_name,
                 concurrency,
                 tasks_fs: FileSystemProvider,
                 address_host=None,
                 external_ports=None,
                 tmp_dir="./agent_tmp",
                 runtimes=None):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify the agent
        :param concurrency: number of simultaneous jobs that can be run by this agent
        :param tasks_fs: FileSystemProvider for the course / tasks
        :param address_host: hostname/IP/... to which external clients should connect to access the Docker containers
        :param external_ports: iterable containing ports to which the docker instance can bind internal ports
        :param tmp_dir: temp dir that is used by the agent to start new containers
        :param runtimes: iterable of DockerRuntime objects describing the runtimes available to this agent (auto-detected when None; the defaults are "runc" for docker and "kata-runtime" for kata)
        """
        super(DockerAgent, self).__init__(context, backend_addr, friendly_name,
                                          concurrency, tasks_fs)

        self._runtimes = {x.envtype: x
                          for x in runtimes} if runtimes is not None else None

        self._logger = logging.getLogger("inginious.agent.docker")

        self._max_memory_per_slot = int(psutil.virtual_memory().total /
                                        concurrency / 1024 / 1024)

        # Temp dir
        self._tmp_dir = tmp_dir

        # SSH remote debug
        self._address_host = address_host
        self._external_ports = set(
            external_ports) if external_ports is not None else set()

        # Async proxy to os
        self._aos = AsyncProxy(os)
        self._ashutil = AsyncProxy(shutil)

    async def _init_clean(self):
        """ Must be called when the agent is starting """
        # Data about running containers
        self._containers_running: Dict[str, DockerRunningJob] = {}  # container_id -> info
        self._container_for_job: Dict[str, str] = {}  # job id -> container_id

        self._student_containers_running: Dict[
            str, DockerRunningStudentContainer] = {}

        self._containers_killed = dict()

        # Delete tmp_dir and recreate it
        try:
            await self._ashutil.rmtree(self._tmp_dir)
        except OSError:
            pass

        try:
            await self._aos.mkdir(self._tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = AsyncProxy(DockerInterface())

        if self._runtimes is None:
            self._runtimes = self._detect_runtimes()

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = await self._docker.get_containers(
            self._runtimes.values())

        if self._address_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            available_bare_container_images = [
                image for envtype_containers in self._containers.values()
                for image in envtype_containers.values()
            ]
            if len(available_bare_container_images) != 0:
                self._address_host = await self._docker.get_host_ip(
                    available_bare_container_images[0]["id"])
            else:
                self._logger.error(
                    "Cannot find the external IP without at least an installed container."
                )

        if self._address_host is None:
            self._logger.warning(
                "Cannot find external host IP. Please indicate it in the configuration. "
                "Remote SSH debug has been deactivated.")
            self._external_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s",
                              self._address_host)

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)

    async def _end_clean(self):
        """ Must be called when the agent is closing """
        await self._timeout_watcher.clean()

        async def close_and_delete(container_id):
            try:
                await self._docker.remove_container(container_id)
            except:
                pass

        for container_id in self._containers_running:
            await close_and_delete(container_id)
        for container_id in self._student_containers_running:
            await close_and_delete(container_id)

    @property
    def environments(self):
        return self._containers

    async def _watch_docker_events(self):
        """ Get raw docker events and convert them to more readable objects, and then give them to self._docker_events_subscriber """
        try:
            source = AsyncIteratorWrapper(
                self._docker.sync.event_stream(
                    filters={"event": ["die", "oom"]}))
            async for i in source:
                if i["Type"] == "container" and i["status"] == "die":
                    container_id = i["id"]
                    try:
                        retval = int(i["Actor"]["Attributes"]["exitCode"])
                    except asyncio.CancelledError:
                        raise
                    except:
                        self._logger.exception(
                            "Cannot parse exitCode for container %s",
                            container_id)
                        retval = -1

                    if container_id in self._containers_running:
                        self._create_safe_task(
                            self.handle_job_closing(container_id, retval))
                    elif container_id in self._student_containers_running:
                        self._create_safe_task(
                            self.handle_student_job_closing(
                                container_id, retval))
                elif i["Type"] == "container" and i["status"] == "oom":
                    container_id = i["id"]
                    if container_id in self._containers_running or container_id in self._student_containers_running:
                        self._logger.info("Container %s did OOM, killing it",
                                          container_id)
                        self._containers_killed[container_id] = "overflow"
                        try:
                            self._create_safe_task(
                                self._docker.kill_container(container_id))
                        except asyncio.CancelledError:
                            raise
                        except:  # this call can sometimes fail, and that is normal.
                            pass
                else:
                    raise TypeError(str(i))
        except asyncio.CancelledError:
            pass
        except:
            self._logger.exception("Exception in _watch_docker_events")

    def __new_job_sync(self, message: BackendNewJob, future_results):
        """ Synchronous part of _new_job. Creates needed directories, copy files, and starts the container. """
        course_id = message.course_id
        task_id = message.task_id

        debug = message.debug
        environment_type = message.environment_type
        environment_name = message.environment

        try:
            enable_network = message.environment_parameters.get(
                "network_grading", False)
            limits = message.environment_parameters.get("limits", {})
            time_limit = int(limits.get("time", 30))
            hard_time_limit = int(
                limits.get("hard_time", None) or time_limit * 3)
            mem_limit = int(limits.get("memory", 200))
            run_cmd = message.environment_parameters.get("run_cmd", '')
        except:
            raise CannotCreateJobException(
                'The agent is unable to parse the parameters')

        course_fs = self._fs.from_subfolder(course_id)
        task_fs = course_fs.from_subfolder(task_id)

        if not course_fs.exists() or not task_fs.exists():
            self._logger.warning("Task %s/%s unavailable on this agent",
                                 course_id, task_id)
            raise CannotCreateJobException(
                'Task unavailable on agent. Please retry later, the agents should synchronize soon. '
                'If the error persists, please contact your course administrator.'
            )

        # Check for realistic memory limit value
        if mem_limit < 20:
            mem_limit = 20
        elif mem_limit > self._max_memory_per_slot:
            self._logger.warning(
                "Task %s/%s ask for too much memory (%dMB)! Available: %dMB",
                course_id, task_id, mem_limit, self._max_memory_per_slot)
            raise CannotCreateJobException(
                'Not enough memory on agent (available: %dMB). Please contact your course administrator.'
                % self._max_memory_per_slot)

        if environment_type not in self._containers or environment_name not in self._containers[
                environment_type]:
            self._logger.warning(
                "Task %s/%s ask for an unknown environment %s/%s", course_id,
                task_id, environment_type, environment_name)
            raise CannotCreateJobException(
                'Unknown container. Please contact your course administrator.')

        environment = self._containers[environment_type][environment_name][
            "id"]
        runtime = self._containers[environment_type][environment_name][
            "runtime"]

        ports_needed = list(
            self._containers[environment_type][environment_name]
            ["ports"])  # copy, as we modify it later!

        if debug == "ssh" and 22 not in ports_needed:
            ports_needed.append(22)

        ports = {}
        if len(ports_needed) > 0:
            time_limit = 30 * 60
            hard_time_limit = 30 * 60
        for p in ports_needed:
            if len(self._external_ports) == 0:
                self._logger.warning(
                    "User asked for a port but no one are available")
                raise CannotCreateJobException(
                    'No ports are available right now. Please retry later.')
            ports[p] = self._external_ports.pop()

        # Create directories for storing all the data for the job
        try:
            container_path = tempfile.mkdtemp(dir=self._tmp_dir)
        except Exception as e:
            self._logger.error("Cannot make container temp directory! %s",
                               str(e),
                               exc_info=True)
            for p in ports:
                self._external_ports.add(ports[p])
            raise CannotCreateJobException(
                'Cannot make container temp directory.')

        task_path = path_join(container_path, 'task')  # tmp_dir/id/task/
        course_path = path_join(container_path, 'course')

        sockets_path = path_join(container_path,
                                 'sockets')  # tmp_dir/id/socket/
        student_path = path_join(task_path,
                                 'student')  # tmp_dir/id/task/student/
        systemfiles_path = path_join(
            task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

        course_common_path = path_join(course_path, 'common')
        course_common_student_path = path_join(course_path, 'common',
                                               'student')

        # Create the needed directories
        os.mkdir(sockets_path)
        os.chmod(container_path, 0o777)
        os.chmod(sockets_path, 0o777)
        os.mkdir(course_path)

        # TODO: avoid copy
        task_fs.copy_from(None, task_path)
        os.chmod(task_path, 0o777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0o777)

        # Copy common and common/student if needed
        # TODO: avoid copy
        if course_fs.from_subfolder("$common").exists():
            course_fs.from_subfolder("$common").copy_from(
                None, course_common_path)
        else:
            os.mkdir(course_common_path)

        if course_fs.from_subfolder("$common").from_subfolder(
                "student").exists():
            course_fs.from_subfolder("$common").from_subfolder(
                "student").copy_from(None, course_common_student_path)
        else:
            os.mkdir(course_common_student_path)

        # Run the container
        try:
            container_id = self._docker.sync.create_container(
                environment, enable_network, mem_limit, task_path,
                sockets_path, course_common_path, course_common_student_path,
                runtime, ports)
        except Exception as e:
            self._logger.warning("Cannot create container! %s",
                                 str(e),
                                 exc_info=True)
            shutil.rmtree(container_path)
            for p in ports:
                self._external_ports.add(ports[p])
            raise CannotCreateJobException('Cannot create container.')

        # Store info
        info = DockerRunningJob(
            message=message,
            container_path=container_path,
            future_results=future_results,
            job_id=message.job_id,
            container_id=container_id,
            inputdata=message.inputdata,
            debug=debug,
            ports=ports,
            environment_type=environment_type,
            environment_name=environment_name,
            mem_limit=mem_limit,
            time_limit=time_limit,
            hard_time_limit=hard_time_limit,
            sockets_path=sockets_path,
            student_path=student_path,
            systemfiles_path=systemfiles_path,
            course_common_student_path=course_common_student_path,
            run_cmd=run_cmd,
            assigned_external_ports=list(ports.values()),
            student_containers=set())

        self._containers_running[container_id] = info
        self._container_for_job[message.job_id] = container_id

        try:
            # Start the container
            self._docker.sync.start_container(container_id)
        except Exception as e:
            self._logger.warning("Cannot start container! %s",
                                 str(e),
                                 exc_info=True)
            shutil.rmtree(container_path)
            for p in ports:
                self._external_ports.add(ports[p])

            raise CannotCreateJobException('Cannot start container')

        return info

    async def new_job(self, message: BackendNewJob):
        """
        Handles a new job: starts the grading container
        """
        future_results = asyncio.Future()
        out = await self._loop.run_in_executor(
            None, lambda: self.__new_job_sync(message, future_results))
        self._create_safe_task(
            self.handle_running_container(out, future_results=future_results))
        await self._timeout_watcher.register_container(out.container_id,
                                                       out.time_limit,
                                                       out.hard_time_limit)

    async def create_student_container(self, parent_info, socket_id,
                                       environment_name, memory_limit,
                                       time_limit, hard_time_limit,
                                       share_network, write_stream):
        """
        Creates a new student container.
        :param write_stream: stream on which to write the return value of the container (with a correctly formatted msgpack message)
        """
        try:
            environment_type = parent_info.environment_type
            self._logger.debug(
                "Starting new student container... %s/%s %s %s %s",
                environment_type, environment_name, memory_limit, time_limit,
                hard_time_limit)

            if environment_type not in self._containers or environment_name not in self._containers[
                    environment_type]:
                self._logger.warning(
                    "Student container asked for an unknown environment %s/%s",
                    environment_type, environment_name)
                await self._write_to_container_stdin(
                    write_stream, {
                        "type": "run_student_retval",
                        "retval": 254,
                        "socket_id": socket_id
                    })
                return

            environment = self._containers[environment_type][environment_name][
                "id"]
            runtime = self._containers[environment_type][environment_name][
                "runtime"]

            try:
                socket_path = path_join(parent_info.sockets_path,
                                        str(socket_id) + ".sock")
                container_id = await self._docker.create_container_student(
                    runtime, environment, memory_limit,
                    parent_info.student_path, socket_path,
                    parent_info.systemfiles_path,
                    parent_info.course_common_student_path,
                    parent_info.container_id if share_network else None)
            except Exception as e:
                self._logger.exception("Cannot create student container!")
                await self._write_to_container_stdin(
                    write_stream, {
                        "type": "run_student_retval",
                        "retval": 254,
                        "socket_id": socket_id
                    })

                if isinstance(e, asyncio.CancelledError):
                    raise

                return

            info = DockerRunningStudentContainer(container_id=container_id,
                                                 parent_info=parent_info,
                                                 socket_id=socket_id,
                                                 write_stream=write_stream)

            parent_info.student_containers.add(container_id)
            self._student_containers_running[container_id] = info

            # send to the container that the sibling has started
            await self._write_to_container_stdin(write_stream, {
                "type": "run_student_started",
                "socket_id": socket_id
            })

            try:
                await self._docker.start_container(container_id)
            except Exception as e:
                self._logger.exception("Cannot start student container!")
                await self._write_to_container_stdin(
                    write_stream, {
                        "type": "run_student_retval",
                        "retval": 254,
                        "socket_id": socket_id
                    })

                if isinstance(e, asyncio.CancelledError):
                    raise

                return

            # Verify the time limit
            await self._timeout_watcher.register_container(
                container_id, time_limit, hard_time_limit)
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in create_student_container")

    async def _write_to_container_stdin(self, write_stream, message):
        """
        Send a message to the stdin of a container, with the right data
        :param write_stream: asyncio write stream to the stdin of the container
        :param message: dict to be msgpacked and sent
        """
        msg = msgpack.dumps(message, use_bin_type=True)
        self._logger.debug("Sending %i bytes to container", len(msg))
        write_stream.write(struct.pack('I', len(msg)))
        write_stream.write(msg)
        await write_stream.drain()

    async def handle_running_container(self, info: DockerRunningJob,
                                       future_results):
        """ Talk with a container. Sends the initial input. Allows to start student containers """
        sock = await self._docker.attach_to_container(info.container_id)
        try:
            read_stream, write_stream = await asyncio.open_connection(
                sock=sock.get_socket())
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception(
                "Exception occurred while creating read/write stream to container"
            )
            return None

        # Send hello msg
        hello_msg = {
            "type": "start",
            "input": info.inputdata,
            "debug": info.debug,
            "envtypes":
            {x.envtype: x.shared_kernel
             for x in self._runtimes.values()}
        }
        if info.run_cmd is not None:
            hello_msg["run_cmd"] = info.run_cmd
        hello_msg["run_as_root"] = self._runtimes[
            info.environment_type].run_as_root
        hello_msg["shared_kernel"] = self._runtimes[
            info.environment_type].shared_kernel

        await self._write_to_container_stdin(write_stream, hello_msg)
        result = None

        buffer = bytearray()
        try:
            while not read_stream.at_eof():
                msg_header = await read_stream.readexactly(8)
                outtype, length = struct.unpack_from(
                    '>BxxxL', msg_header
                )  # format imposed by docker in the attach endpoint
                if length != 0:
                    content = await read_stream.readexactly(length)
                    if outtype == 1:  # stdout
                        buffer += content

                    if outtype == 2:  # stderr
                        self._logger.debug(
                            "Received stderr from containers:\n%s", content)

                    # The first 4 bytes give the message length. While the buffer holds a complete message...
                    while len(buffer) > 4 and len(buffer) >= 4 + struct.unpack('I', buffer[0:4])[0]:
                        msg_length = struct.unpack('I', buffer[0:4])[0]
                        msg_encoded = buffer[4:4 + msg_length]  # ... extract it ...
                        buffer = buffer[4 + msg_length:]  # ... and withdraw it from the buffer
                        try:
                            msg = msgpack.unpackb(msg_encoded, use_list=False)
                            self._logger.debug(
                                "Received msg %s from container %s",
                                msg["type"], info.container_id)
                            if msg["type"] == "run_student":
                                # start a new student container
                                environment = msg[
                                    "environment"] or info.environment_name
                                memory_limit = min(
                                    msg["memory_limit"] or info.mem_limit,
                                    info.mem_limit)
                                time_limit = min(
                                    msg["time_limit"] or info.time_limit,
                                    info.time_limit)
                                hard_time_limit = min(
                                    msg["hard_time_limit"]
                                    or info.hard_time_limit,
                                    info.hard_time_limit)
                                share_network = msg["share_network"]
                                socket_id = msg["socket_id"]
                                assert "/" not in socket_id  # ensure task creator do not try to break the agent :-(
                                self._create_safe_task(
                                    self.create_student_container(
                                        info, socket_id, environment,
                                        memory_limit, time_limit,
                                        hard_time_limit, share_network,
                                        write_stream))
                            elif msg["type"] == "ssh_key":
                                # send the data to the backend (and client)
                                self._logger.info("%s %s", info.container_id,
                                                  str(msg))
                                await self.send_ssh_job_info(
                                    info.job_id, self._address_host,
                                    info.ports[22], msg["ssh_user"],
                                    msg["ssh_key"])
                            elif msg["type"] == "result":
                                # last message containing the results of the container
                                result = msg["result"]
                        except:
                            self._logger.exception(
                                "Received incorrect message from container %s (job id %s)",
                                info.container_id, info.job_id)
        except asyncio.IncompleteReadError:
            self._logger.debug(
                "Container output ended with an IncompleteReadError; It was probably killed."
            )
        except asyncio.CancelledError:
            write_stream.close()
            sock.close_socket()
            future_results.set_result(result)
            raise
        except:
            self._logger.exception(
                "Exception while reading container %s output",
                info.container_id)

        write_stream.close()
        sock.close_socket()
        future_results.set_result(result)

        if not result:
            self._logger.warning("Container %s has not given any result",
                                 info.container_id)

    async def handle_student_job_closing(self, container_id, retval):
        """
        Handle a closing student container. Do some cleaning, verify memory limits, timeouts, ... and return data to the associated grading container
        """
        try:
            self._logger.debug("Closing student %s", container_id)
            try:
                info = self._student_containers_running[container_id]
                del self._student_containers_running[container_id]
            except asyncio.CancelledError:
                raise
            except:
                self._logger.warning(
                    "Student container %s that has finished(p1) was not launched by this agent",
                    str(container_id),
                    exc_info=True)
                return

            # Delete remaining student containers
            info.parent_info.student_containers.remove(container_id)

            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                del self._containers_killed[container_id]

            if killed == "timeout":
                retval = 253
            elif killed == "overflow":
                retval = 252

            try:
                await self._write_to_container_stdin(
                    info.write_stream, {
                        "type": "run_student_retval",
                        "retval": retval,
                        "socket_id": info.socket_id
                    })
            except asyncio.CancelledError:
                raise
            except:
                pass  # parent container closed

            # Do not forget to remove the container
            try:
                await self._docker.remove_container(container_id)
            except asyncio.CancelledError:
                raise
            except:
                pass  # ignore
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_student_job_closing")

    async def handle_job_closing(self, container_id, retval):
        """
        Handle a closing grading container. Do some cleaning, verify memory limits, timeouts, ... and return data to the backend
        """
        try:
            self._logger.debug("Closing %s", container_id)
            try:
                info = self._containers_running[container_id]
                del self._containers_running[container_id]
            except asyncio.CancelledError:
                raise
            except:
                self._logger.warning(
                    "Container %s that has finished(p1) was not launched by this agent",
                    str(container_id),
                    exc_info=True)
                return

            # Close sub containers
            for student_container_id_loop in info.student_containers:
                # little hack to ensure the value of student_container_id_loop is copied into the closure
                async def close_and_delete(
                        student_container_id=student_container_id_loop):
                    try:
                        await self._docker.kill_container(student_container_id)
                        await self._docker.remove_container(
                            student_container_id)
                    except asyncio.CancelledError:
                        raise
                    except:
                        pass  # ignore

                self._create_safe_task(
                    close_and_delete(student_container_id_loop))

            # Allow other containers to reuse the external ports this container was using
            for p in info.assigned_external_ports:
                self._external_ports.add(p)

            # Verify if the container was killed, either by the client, by an OOM or by a timeout
            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                del self._containers_killed[container_id]

            stdout = ""
            stderr = ""
            result = "crash" if retval == -1 else None
            error_msg = None
            grade = None
            problems = {}
            custom = {}
            tests = {}
            archive = None
            state = ""

            if killed is not None:
                result = killed

            # If everything did well, continue to retrieve the status from the container
            if result is None:
                # Get logs back
                try:
                    return_value = await info.future_results

                    # Accepted types for return dict
                    accepted_types = {
                        "stdout": str,
                        "stderr": str,
                        "result": str,
                        "text": str,
                        "grade": float,
                        "problems": dict,
                        "custom": dict,
                        "tests": dict,
                        "state": str,
                        "archive": str
                    }

                    keys_fct = {
                        "problems": id_checker,
                        "custom": id_checker,
                        "tests": id_checker_tests
                    }

                    # Check dict content
                    for key, item in return_value.items():
                        if not isinstance(item, accepted_types[key]):
                            raise Exception(
                                "Feedback file is badly formatted.")
                        elif accepted_types[key] == dict and key != "custom":  # custom can contain anything
                            for sub_key, sub_item in item.items():
                                if not keys_fct[key](sub_key) or isinstance(
                                        sub_item, dict):
                                    raise Exception(
                                        "Feedback file is badly formatted.")

                    # Set output fields
                    stdout = return_value.get("stdout", "")
                    stderr = return_value.get("stderr", "")
                    result = return_value.get("result", "error")
                    error_msg = return_value.get("text", "")
                    grade = return_value.get("grade", None)
                    problems = return_value.get("problems", {})
                    custom = return_value.get("custom", {})
                    tests = return_value.get("tests", {})
                    state = return_value.get("state", "")
                    archive = return_value.get("archive", None)
                    if archive is not None:
                        archive = base64.b64decode(archive)
                except Exception as e:
                    self._logger.exception(
                        "Cannot get back output of container %s! (%s)",
                        container_id, str(e))
                    result = "crash"
                    error_msg = 'The grader did not return a readable output: {}'.format(
                        str(e))

            # Default values
            if error_msg is None:
                error_msg = ""
            if grade is None:
                if result == "success":
                    grade = 100.0
                else:
                    grade = 0.0

            # Remove container
            try:
                await self._docker.remove_container(container_id)
            except asyncio.CancelledError:
                raise
            except:
                pass

            # Delete folders
            try:
                await self._ashutil.rmtree(info.container_path)
            except PermissionError:
                self._logger.debug("Cannot remove old container path!")
                pass  # todo: run a docker container to force removal

            # Return!
            await self.send_job_result(info.job_id, result, error_msg, grade,
                                       problems, tests, custom, state, archive,
                                       stdout, stderr)

            # Do not forget to remove data from internal state
            del self._container_for_job[info.job_id]
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_job_closing")

    async def kill_job(self, message: BackendKillJob):
        """ Handles `kill` messages. Kill things. """
        try:
            if message.job_id in self._container_for_job:
                self._containers_killed[self._container_for_job[
                    message.job_id]] = "killed"
                await self._docker.kill_container(
                    self._container_for_job[message.job_id])
            else:
                self._logger.warning(
                    "Cannot kill container for job %s because it is not running",
                    str(message.job_id))
                # Ensure the backend/frontend receive the info that the job is done. This will be ignored in the worst
                # case.
                await self.send_job_result(message.job_id, "killed")
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_kill_job")

    async def run(self):
        await self._init_clean()

        # Init Docker events watcher
        watcher_docker_event = self._create_safe_task(
            self._watch_docker_events())

        try:
            await super(DockerAgent, self).run()
        except:
            await self._end_clean()
            raise

    def _detect_runtimes(self) -> Dict[str, DockerRuntime]:
        heuristic = [
            ("runc", lambda x: DockerRuntime(runtime=x,
                                             run_as_root=False,
                                             shared_kernel=True,
                                             envtype="docker")),
            ("crun", lambda x: DockerRuntime(runtime=x,
                                             run_as_root=False,
                                             shared_kernel=True,
                                             envtype="docker")),
            ("kata", lambda x: DockerRuntime(runtime=x,
                                             run_as_root=True,
                                             shared_kernel=False,
                                             envtype="kata")),
        ]
        retval = {}

        for runtime in self._docker.sync.list_runtimes().keys():
            for h_runtime, f in heuristic:
                if h_runtime in runtime:
                    v = f(runtime)
                    if v.envtype not in retval:
                        self._logger.info(
                            "Using %s as runtime with parameters %s", runtime,
                            str(v))
                        retval[v.envtype] = v
                    else:
                        self._logger.warning(
                            "%s was detected as a runtime; it would duplicate another one, so we ignore it. %s",
                            runtime, str(v))
        return retval
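
The agent talks to its containers with length-prefixed msgpack messages: _write_to_container_stdin packs a 4-byte native-order length followed by the payload, and handle_running_container reassembles messages from the output buffer. A self-contained sketch of that framing, mirroring the struct format strings used above:

import struct
import msgpack

def frame(message: dict) -> bytes:
    """Encode a dict as a length-prefixed msgpack message."""
    payload = msgpack.dumps(message, use_bin_type=True)
    return struct.pack('I', len(payload)) + payload

def unframe(buffer: bytearray):
    """Split complete messages off the front of the buffer; returns (messages, remainder)."""
    messages = []
    while len(buffer) > 4 and len(buffer) >= 4 + struct.unpack('I', buffer[0:4])[0]:
        length = struct.unpack('I', buffer[0:4])[0]
        messages.append(msgpack.unpackb(bytes(buffer[4:4 + length]), use_list=False))
        buffer = buffer[4 + length:]
    return messages, buffer

# Round trip:
msgs, rest = unframe(bytearray(frame({"type": "result", "result": {"result": "success"}})))
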
Example #7
class DockerAgent(Agent):
    def __init__(self, context, backend_addr, friendly_name, concurrency, tasks_fs: FileSystemProvider, address_host=None, external_ports=None, tmp_dir="./agent_tmp"):
        """
        :param context: ZeroMQ context for this process
        :param backend_addr: address of the backend (for example, "tcp://127.0.0.1:2222")
        :param friendly_name: a string containing a friendly name to identify the agent
        :param concurrency: number of simultaneous jobs that can be run by this agent
        :param tasks_fs: FileSystemProvider for the course / tasks
        :param address_host: hostname/IP/... to which external clients should connect to access the Docker containers
        :param external_ports: iterable containing ports to which the docker instance can bind internal ports
        :param tmp_dir: temp dir that is used by the agent to start new containers
        """
        super(DockerAgent, self).__init__(context, backend_addr, friendly_name, concurrency, tasks_fs)
        self._logger = logging.getLogger("inginious.agent.docker")

        self._max_memory_per_slot = int(psutil.virtual_memory().total / concurrency / 1024 / 1024)

        self.tasks_fs = tasks_fs

        # Temp dir
        self._tmp_dir = tmp_dir

        # SSH remote debug
        self._address_host = address_host
        self._external_ports = set(external_ports) if external_ports is not None else set()

        # Async proxy to os
        self._aos = AsyncProxy(os)
        self._ashutil = AsyncProxy(shutil)

    async def _init_clean(self):
        """ Must be called when the agent is starting """
        # Data about running containers
        self._containers_running = {}
        self._container_for_job = {}

        self._student_containers_running = {}
        self._student_containers_for_job = {}

        self._containers_killed = dict()

        # Delete tmp_dir and recreate it
        try:
            await self._ashutil.rmtree(self._tmp_dir)
        except OSError:
            pass

        try:
            await self._aos.mkdir(self._tmp_dir)
        except OSError:
            pass

        # Docker
        self._docker = AsyncProxy(DockerInterface())

        # Auto discover containers
        self._logger.info("Discovering containers")
        self._containers = await self._docker.get_containers()

        self._assigned_external_ports = {}  # container_id : [external_ports]

        if self._address_host is None and len(self._containers) != 0:
            self._logger.info("Guessing external host IP")
            self._address_host = await self._docker.get_host_ip(next(iter(self._containers.values()))["id"])
        if self._address_host is None:
            self._logger.warning(
                "Cannot find external host IP. Please indicate it in the configuration. Remote SSH debug has been deactivated.")
            self._external_ports = None
        else:
            self._logger.info("External address for SSH remote debug is %s", self._address_host)

        # Watchers
        self._timeout_watcher = TimeoutWatcher(self._docker)

    async def _end_clean(self):
        """ Must be called when the agent is closing """
        await self._timeout_watcher.clean()

        async def close_and_delete(container_id):
            try:
                await self._docker.remove_container(container_id)
            except:
                pass

        for container_id in self._containers_running:
            await close_and_delete(container_id)
        for container_id in self._student_containers_running:
            await close_and_delete(container_id)

    @property
    def environments(self):
        return self._containers

    async def _watch_docker_events(self):
        """ Get raw docker events and convert them to more readable objects, and then give them to self._docker_events_subscriber """
        try:
            source = AsyncIteratorWrapper(self._docker.sync.event_stream(filters={"event": ["die", "oom"]}))
            async for i in source:
                if i["Type"] == "container" and i["status"] == "die":
                    container_id = i["id"]
                    try:
                        retval = int(i["Actor"]["Attributes"]["exitCode"])
                    except asyncio.CancelledError:
                        raise
                    except:
                        self._logger.exception("Cannot parse exitCode for container %s", container_id)
                        retval = -1

                    if container_id in self._containers_running:
                        self._create_safe_task(self.handle_job_closing(container_id, retval))
                    elif container_id in self._student_containers_running:
                        self._create_safe_task(self.handle_student_job_closing(container_id, retval))
                elif i["Type"] == "container" and i["status"] == "oom":
                    container_id = i["id"]
                    if container_id in self._containers_running or container_id in self._student_containers_running:
                        self._logger.info("Container %s did OOM, killing it", container_id)
                        self._containers_killed[container_id] = "overflow"
                        try:
                            self._create_safe_task(self._docker.kill_container(container_id))
                        except asyncio.CancelledError:
                            raise
                        except:  # this call can sometimes fail, and that is normal.
                            pass
                else:
                    raise TypeError(str(i))
        except asyncio.CancelledError:
            pass
        except:
            self._logger.exception("Exception in _watch_docker_events")

    def __new_job_sync(self, message, future_results):
        """ Synchronous part of _new_job. Creates needed directories, copy files, and starts the container. """
        course_id = message.course_id
        task_id = message.task_id

        debug = message.debug
        environment_name = message.environment
        enable_network = message.enable_network
        time_limit = message.time_limit
        hard_time_limit = message.hard_time_limit or time_limit * 3
        mem_limit = message.mem_limit

        course_fs = self.tasks_fs.from_subfolder(course_id)
        task_fs = course_fs.from_subfolder(task_id)

        if not course_fs.exists() or not task_fs.exists():
            self._logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
            raise CannotCreateJobException('Task unavailable on agent. Please retry later, the agents should synchronize soon. '
                             'If the error persists, please contact your course administrator.')

        # Check for realistic memory limit value
        if mem_limit < 20:
            mem_limit = 20
        elif mem_limit > self._max_memory_per_slot:
            self._logger.warning("Task %s/%s ask for too much memory (%dMB)! Available: %dMB", course_id, task_id,
                                 mem_limit, self._max_memory_per_slot)
            raise CannotCreateJobException('Not enough memory on agent (available: %dMB). Please contact your course administrator.' % self._max_memory_per_slot)

        if environment_name not in self._containers:
            self._logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", course_id, task_id,
                                 environment_name)
            raise CannotCreateJobException('Unknown container. Please contact your course administrator.')

        environment = self._containers[environment_name]["id"]

        ports_needed = list(self._containers[environment_name]["ports"])  # copy, as we may append to it below

        if debug == "ssh" and 22 not in ports_needed:
            ports_needed.append(22)

        ports = {}
        if len(ports_needed) > 0:
            time_limit = 30 * 60
            hard_time_limit = 30 * 60
        for p in ports_needed:
            if len(self._external_ports) == 0:
                self._logger.warning("User asked for a port but no one are available")
                raise CannotCreateJobException('No ports are available right now. Please retry later.')
            ports[p] = self._external_ports.pop()

        # Create directories for storing all the data for the job
        try:
            container_path = tempfile.mkdtemp(dir=self._tmp_dir)
        except Exception as e:
            self._logger.error("Cannot make container temp directory! %s", str(e), exc_info=True)
            for p in ports:
                self._external_ports.add(ports[p])
            raise CannotCreateJobException('Cannot make container temp directory.')

        task_path = path_join(container_path, 'task')  # tmp_dir/id/task/
        course_path = path_join(container_path, 'course')

        sockets_path = path_join(container_path, 'sockets')  # tmp_dir/id/socket/
        student_path = path_join(task_path, 'student')  # tmp_dir/id/task/student/
        systemfiles_path = path_join(task_path, 'systemfiles')  # tmp_dir/id/task/systemfiles/

        course_common_path = path_join(course_path, 'common')
        course_common_student_path = path_join(course_path, 'common', 'student')

        # Create the needed directories
        os.mkdir(sockets_path)
        os.chmod(container_path, 0o777)
        os.chmod(sockets_path, 0o777)
        os.mkdir(course_path)

        # TODO: avoid copy
        task_fs.copy_from(None, task_path)
        os.chmod(task_path, 0o777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0o777)

        # Copy common and common/student if needed
        # TODO: avoid copy
        if course_fs.from_subfolder("$common").exists():
            course_fs.from_subfolder("$common").copy_from(None, course_common_path)
        else:
            os.mkdir(course_common_path)

        if course_fs.from_subfolder("$common").from_subfolder("student").exists():
            course_fs.from_subfolder("$common").from_subfolder("student").copy_from(None, course_common_student_path)
        else:
            os.mkdir(course_common_student_path)

        # Run the container
        try:
            container_id = self._docker.sync.create_container(environment, enable_network, mem_limit, task_path,
                                                              sockets_path, course_common_path,
                                                              course_common_student_path, ports)
        except Exception as e:
            self._logger.warning("Cannot create container! %s", str(e), exc_info=True)
            shutil.rmtree(container_path)
            self._external_ports.update(ports.values())
            raise CannotCreateJobException('Cannot create container.')

        # Store info
        self._containers_running[container_id] = message, container_path, future_results
        self._container_for_job[message.job_id] = container_id
        self._student_containers_for_job[message.job_id] = set()

        if len(ports) != 0:
            self._assigned_external_ports[container_id] = list(ports.values())

        try:
            # Start the container
            self._docker.sync.start_container(container_id)
        except Exception as e:
            self._logger.warning("Cannot start container! %s", str(e), exc_info=True)
            shutil.rmtree(container_path)
            self._external_ports.update(ports.values())

            raise CannotCreateJobException('Cannot start container.')

        return {
            "job_id": message.job_id,
            "container_id": container_id,
            "inputdata": message.inputdata,
            "debug": debug,
            "ports": ports,
            "orig_env": environment_name,
            "orig_memory_limit": mem_limit,
            "orig_time_limit": time_limit,
            "orig_hard_time_limit": hard_time_limit,
            "sockets_path": sockets_path,
            "student_path": student_path,
            "systemfiles_path": systemfiles_path,
            "course_common_student_path": course_common_student_path
        }

    async def new_job(self, message: BackendNewJob):
        """
        Handles a new job: starts the grading container
        """
        self._logger.info("Received request for jobid %s", message.job_id)
        future_results = asyncio.Future()
        out = await self._loop.run_in_executor(None, lambda: self.__new_job_sync(message, future_results))
        self._create_safe_task(self.handle_running_container(**out, future_results=future_results))
        await self._timeout_watcher.register_container(out["container_id"], out["orig_time_limit"], out["orig_hard_time_limit"])

    async def create_student_container(self, job_id, parent_container_id, sockets_path, student_path, systemfiles_path,
                                       course_common_student_path, socket_id,  environment_name, memory_limit,
                                       time_limit, hard_time_limit, share_network, write_stream):
        """
        Creates a new student container.
        :param write_stream: stream on which to write the return value of the container (with a correctly formatted msgpack message)
        """
        try:
            self._logger.debug("Starting new student container... %s %s %s %s", environment_name, memory_limit, time_limit, hard_time_limit)

            if environment_name not in self._containers:
                self._logger.warning("Student container asked for an unknown environment %s (not in aliases)", environment_name)
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})
                return

            environment = self._containers[environment_name]["id"]

            try:
                socket_path = path_join(sockets_path, str(socket_id) + ".sock")
                container_id = await self._docker.create_container_student(parent_container_id, environment, share_network,
                                                                           memory_limit, student_path, socket_path,
                                                                           systemfiles_path, course_common_student_path)
            except Exception as e:
                self._logger.exception("Cannot create student container!")
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})

                if isinstance(e, asyncio.CancelledError):
                    raise

                return

            self._student_containers_for_job[job_id].add(container_id)
            self._student_containers_running[container_id] = job_id, parent_container_id, socket_id, write_stream

            # send to the container that the sibling has started
            await self._write_to_container_stdin(write_stream, {"type": "run_student_started", "socket_id": socket_id})

            try:
                await self._docker.start_container(container_id)
            except Exception as e:
                self._logger.exception("Cannot start student container!")
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": 254, "socket_id": socket_id})

                if isinstance(e, asyncio.CancelledError):
                    raise

                return

            # Verify the time limit
            await self._timeout_watcher.register_container(container_id, time_limit, hard_time_limit)
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in create_student_container")

    async def _write_to_container_stdin(self, write_stream, message):
        """
        Send a message to the stdin of a container, with the right data
        :param write_stream: asyncio write stream to the stdin of the container
        :param message: dict to be msgpacked and sent
        """
        msg = msgpack.dumps(message, encoding="utf8", use_bin_type=True)
        self._logger.debug("Sending %i bytes to container", len(msg))
        write_stream.write(struct.pack('I', len(msg)))
        write_stream.write(msg)
        await write_stream.drain()
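
    # A minimal sketch (illustrative only) of the framing used above and decoded in
    # handle_running_container: each message is msgpack-encoded and prefixed with its
    # length packed as a 4-byte unsigned int:
    #
    #   payload = msgpack.dumps({"type": "start"}, encoding="utf8", use_bin_type=True)
    #   framed = struct.pack('I', len(payload)) + payload
    #   # receiver side:
    #   length = struct.unpack('I', framed[:4])[0]
    #   message = msgpack.unpackb(framed[4:4 + length], encoding="utf8", use_list=False)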

    async def handle_running_container(self, job_id, container_id, inputdata, debug, ports, orig_env,
                                       orig_memory_limit, orig_time_limit, orig_hard_time_limit, sockets_path,
                                       student_path, systemfiles_path, course_common_student_path, future_results):
        """ Talk with a container. Sends the initial input. Allows to start student containers """
        sock = await self._docker.attach_to_container(container_id)
        try:
            read_stream, write_stream = await asyncio.open_connection(sock=sock.get_socket())
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception occurred while creating read/write stream to container")
            return None

        # Send hello msg
        await self._write_to_container_stdin(write_stream, {"type": "start", "input": inputdata, "debug": debug})
        result = None

        buffer = bytearray()
        try:
            while not read_stream.at_eof():
                msg_header = await read_stream.readexactly(8)
                outtype, length = struct.unpack_from('>BxxxL', msg_header)  # format imposed by docker in the attach endpoint
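                # header layout: 1-byte stream type (1 = stdout, 2 = stderr), 3 padding bytes,
                # then the payload length as a big-endian uint32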
                if length != 0:
                    content = await read_stream.readexactly(length)
                    if outtype == 1:  # stdout
                        buffer += content

                    if outtype == 2:  # stderr
                        self._logger.debug("Received stderr from containers:\n%s", content)

                    # The first 4 bytes are the length of the message. If we have a complete message...
                    while len(buffer) > 4 and len(buffer) >= 4+struct.unpack('I',buffer[0:4])[0]:
                        msg_encoded = buffer[4:4 + struct.unpack('I', buffer[0:4])[0]]  # ... get it
                        buffer = buffer[4 + struct.unpack('I', buffer[0:4])[0]:]  # ... withdraw it from the buffer
                        try:
                            msg = msgpack.unpackb(msg_encoded, encoding="utf8", use_list=False)
                            self._logger.debug("Received msg %s from container %s", msg["type"], container_id)
                            if msg["type"] == "run_student":
                                # start a new student container
                                environment = msg["environment"] or orig_env
                                memory_limit = min(msg["memory_limit"] or orig_memory_limit, orig_memory_limit)
                                time_limit = min(msg["time_limit"] or orig_time_limit, orig_time_limit)
                                hard_time_limit = min(msg["hard_time_limit"] or orig_hard_time_limit, orig_hard_time_limit)
                                share_network = msg["share_network"]
                                socket_id = msg["socket_id"]
                                assert "/" not in socket_id  # ensure task creator do not try to break the agent :-(
                                self._create_safe_task(self.create_student_container(job_id, container_id, sockets_path, student_path,
                                                                                     systemfiles_path, course_common_student_path,
                                                                                     socket_id,  environment, memory_limit, time_limit,
                                                                                     hard_time_limit, share_network, write_stream))
                            elif msg["type"] == "ssh_key":
                                # send the data to the backend (and client)
                                self._logger.info("%s %s", container_id, str(msg))
                                await self.send_ssh_job_info(job_id, self._address_host, ports[22], msg["ssh_key"])
                            elif msg["type"] == "result":
                                # last message containing the results of the container
                                result = msg["result"]
                        except:
                            self._logger.exception("Received incorrect message from container %s (job id %s)", container_id, job_id)
        except asyncio.IncompleteReadError:
            self._logger.debug("Container output ended with an IncompleteReadError; It was probably killed.")
        except asyncio.CancelledError:
            write_stream.close()
            sock.close_socket()
            future_results.set_result(result)
            raise
        except:
            self._logger.exception("Exception while reading container %s output", container_id)

        write_stream.close()
        sock.close_socket()
        future_results.set_result(result)

        if not result:
            self._logger.warning("Container %s has not given any result", container_id)

    async def handle_student_job_closing(self, container_id, retval):
        """
        Handle a closing student container. Do some cleaning, verify memory limits, timeouts, ... and return data to the associated grading
        container
        """
        try:
            self._logger.debug("Closing student %s", container_id)
            try:
                job_id, parent_container_id, socket_id, write_stream = self._student_containers_running[container_id]
                del self._student_containers_running[container_id]
            except asyncio.CancelledError:
                raise
            except:
                self._logger.warning("Student container %s that has finished(p1) was not launched by this agent", str(container_id), exc_info=True)
                return

            # Delete remaining student containers
            if job_id in self._student_containers_for_job:  # if it does not exist, then the parent container has closed
                self._student_containers_for_job[job_id].remove(container_id)

            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                del self._containers_killed[container_id]

            if killed == "timeout":
                retval = 253
            elif killed == "overflow":
                retval = 252
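
            # retval convention for student containers (also used in create_student_container):
            #   252 = killed by memory overflow, 253 = killed by timeout,
            #   254 = the agent could not create or start the container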

            try:
                await self._write_to_container_stdin(write_stream, {"type": "run_student_retval", "retval": retval, "socket_id": socket_id})
            except asyncio.CancelledError:
                raise
            except:
                pass  # parent container closed

            # Do not forget to remove the container
            try:
                await self._docker.remove_container(container_id)
            except asyncio.CancelledError:
                raise
            except:
                pass  # ignore
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_student_job_closing")

    async def handle_job_closing(self, container_id, retval):
        """
        Handle a closing grading container. Do some cleaning, verify memory limits, timeouts, ... and return data to the backend
        """
        try:
            self._logger.debug("Closing %s", container_id)
            try:
                message, container_path, future_results = self._containers_running[container_id]
                del self._containers_running[container_id]
            except asyncio.CancelledError:
                raise
            except:
                self._logger.warning("Container %s that has finished(p1) was not launched by this agent", str(container_id), exc_info=True)
                return

            # Close sub containers
            for student_container_id_loop in self._student_containers_for_job[message.job_id]:
                # little hack to ensure the value of student_container_id_loop is copied into the closure
                async def close_and_delete(student_container_id=student_container_id_loop):
                    try:
                        await self._docker.kill_container(student_container_id)
                        await self._docker.remove_container(student_container_id)
                    except asyncio.CancelledError:
                        raise
                    except:
                        pass  # ignore
                self._create_safe_task(close_and_delete(student_container_id_loop))
            del self._student_containers_for_job[message.job_id]

            # Allow other containers to reuse the external ports this container used
            if container_id in self._assigned_external_ports:
                for p in self._assigned_external_ports[container_id]:
                    self._external_ports.add(p)
                del self._assigned_external_ports[container_id]

            # Verify if the container was killed, either by the client, by an OOM or by a timeout
            killed = await self._timeout_watcher.was_killed(container_id)
            if container_id in self._containers_killed:
                killed = self._containers_killed[container_id]
                del self._containers_killed[container_id]

            stdout = ""
            stderr = ""
            result = "crash" if retval == -1 else None
            error_msg = None
            grade = None
            problems = {}
            custom = {}
            tests = {}
            archive = None
            state = ""

            if killed is not None:
                result = killed

            # If everything did well, continue to retrieve the status from the container
            if result is None:
                # Get logs back
                try:
                    return_value = await future_results

                    # Accepted types for return dict
                    accepted_types = {"stdout": str, "stderr": str, "result": str, "text": str, "grade": float,
                                      "problems": dict, "custom": dict, "tests": dict, "state": str, "archive": str}

                    keys_fct = {"problems": id_checker, "custom": id_checker, "tests": id_checker_tests}

                    # Check dict content
                    for key, item in return_value.items():
                        if not isinstance(item, accepted_types[key]):
                            raise Exception("Feedback file is badly formatted.")
                        elif accepted_types[key] == dict and key != "custom": #custom can contain anything:
                            for sub_key, sub_item in item.items():
                                if not keys_fct[key](sub_key) or isinstance(sub_item, dict):
                                    raise Exception("Feedback file is badly formatted.")

                    # Set output fields
                    stdout = return_value.get("stdout", "")
                    stderr = return_value.get("stderr", "")
                    result = return_value.get("result", "error")
                    error_msg = return_value.get("text", "")
                    grade = return_value.get("grade", None)
                    problems = return_value.get("problems", {})
                    custom = return_value.get("custom", {})
                    tests = return_value.get("tests", {})
                    state = return_value.get("state", "")
                    archive = return_value.get("archive", None)
                    if archive is not None:
                        archive = base64.b64decode(archive)
                except Exception as e:
                    self._logger.exception("Cannot get back output of container %s! (%s)", container_id, str(e))
                    result = "crash"
                    error_msg = 'The grader did not return a readable output: {}'.format(str(e))

            # Default values
            if error_msg is None:
                error_msg = ""
            if grade is None:
                if result == "success":
                    grade = 100.0
                else:
                    grade = 0.0

            # Remove container
            try:
                await self._docker.remove_container(container_id)
            except asyncio.CancelledError:
                raise
            except:
                pass

            # Delete folders
            try:
                await self._ashutil.rmtree(container_path)
            except PermissionError:
                self._logger.debug("Cannot remove old container path!")
                # TODO: run a docker container to force removal

            # Return!
            await self.send_job_result(message.job_id, result, error_msg, grade, problems, tests, custom, state, archive, stdout, stderr)

            # Do not forget to remove data from internal state
            del self._container_for_job[message.job_id]
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_job_closing")

    async def kill_job(self, message: BackendKillJob):
        """ Handles `kill` messages. Kill things. """
        try:
            if message.job_id in self._container_for_job:
                self._containers_killed[self._container_for_job[message.job_id]] = "killed"
                await self._docker.kill_container(self._container_for_job[message.job_id])
            else:
                self._logger.warning("Cannot kill container for job %s because it is not running", str(message.job_id))
        except asyncio.CancelledError:
            raise
        except:
            self._logger.exception("Exception in handle_kill_job")

    async def run(self):
        await self._init_clean()

        # Init Docker events watcher
        watcher_docker_event = self._create_safe_task(self._watch_docker_events())

        try:
            await super(DockerAgent, self).run()
        except:
            await self._end_clean()
            raise