Example #1
0
    def __init__(self,
                 task_directory,
                 course_factory,
                 task_factory,
                 ssh_manager_location,
                 tmp_dir="./agent_tmp"):
        """
        Initialise the agent: reset the temporary directory, optionally create the remote SSH manager and start the
        cgroup timeout/memory watchers.

        :param task_directory: stored as-is in self.task_directory
        :param course_factory: stored as-is in self.course_factory
        :param task_factory: stored as-is in self.task_factory
        :param ssh_manager_location: port or filename(unix socket) to bind to. If None, remote debugging is deactivated
        :param tmp_dir: directory that is wiped and recreated at startup, stored in self.tmp_dir
        :return:
        """
        # Imported here rather than at module level, as in the original code
        from inginious.backend.agent._cgroup_helper import CGroupTimeoutWatcher, CGroupMemoryWatcher

        self.logger.info("Starting agent")
        self.image_aliases = []
        self.tmp_dir = tmp_dir
        self.task_directory = task_directory
        self.course_factory = course_factory
        self.task_factory = task_factory

        # Delete tmp_dir, and recreate it. Both steps are best-effort: the directory may not exist yet, or may
        # contain files that cannot be removed from here. KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            rmtree(tmp_dir)
        except Exception as err:
            self.logger.debug("Cannot remove %s: %s", tmp_dir, err)

        try:
            os.mkdir(tmp_dir)
        except OSError:
            # Typically the directory still exists because the removal above failed
            pass

        # Assert that the folders are *really* empty
        self._force_directory_empty(tmp_dir)

        if ssh_manager_location is not None:
            self.remote_ssh_manager = RemoteSSHManager(ssh_manager_location)
        else:
            self.remote_ssh_manager = None

        self.logger.debug("Start cgroup helper")
        self._timeout_watcher = CGroupTimeoutWatcher()
        self._memory_watcher = CGroupMemoryWatcher()
        self._timeout_watcher.start()
        self._memory_watcher.start()

        # Init the internal job count, used to name the directories
        self._internal_job_count_lock = threading.Lock()
        self._internal_job_count = 0

        # Dict that stores running container ids for each job id
        self._container_for_job = {}

        # Stores container id of killed containers
        self._killed_containers = set()
Example #2
0
    def __init__(self, task_directory, course_factory, task_factory, ssh_manager_location, tmp_dir="./agent_tmp"):
        """
        Initialise the agent: reset the temporary directory, optionally create the remote SSH manager and start the
        cgroup timeout/memory watchers.

        :param task_directory: stored as-is in self.task_directory
        :param course_factory: stored as-is in self.course_factory
        :param task_factory: stored as-is in self.task_factory
        :param ssh_manager_location: port or filename(unix socket) to bind to. If None, remote debugging is deactivated
        :param tmp_dir: directory that is wiped and recreated at startup, stored in self.tmp_dir
        :return:
        """
        # Imported here rather than at module level, as in the original code
        from inginious.backend.agent._cgroup_helper import CGroupTimeoutWatcher, CGroupMemoryWatcher

        self.logger.info("Starting agent")
        self.image_aliases = []
        self.tmp_dir = tmp_dir
        self.task_directory = task_directory
        self.course_factory = course_factory
        self.task_factory = task_factory

        # Delete tmp_dir, and recreate it. Both steps are best-effort: the directory may not exist yet, or may
        # contain files that cannot be removed from here. KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            rmtree(tmp_dir)
        except Exception as err:
            self.logger.debug("Cannot remove %s: %s", tmp_dir, err)

        try:
            os.mkdir(tmp_dir)
        except OSError:
            # Typically the directory still exists because the removal above failed
            pass

        # Assert that the folders are *really* empty
        self._force_directory_empty(tmp_dir)

        if ssh_manager_location is not None:
            self.remote_ssh_manager = RemoteSSHManager(ssh_manager_location)
        else:
            self.remote_ssh_manager = None

        self.logger.debug("Start cgroup helper")
        self._timeout_watcher = CGroupTimeoutWatcher()
        self._memory_watcher = CGroupMemoryWatcher()
        self._timeout_watcher.start()
        self._memory_watcher.start()

        # Init the internal job count, used to name the directories
        self._internal_job_count_lock = threading.Lock()
        self._internal_job_count = 0

        # Dict that stores running container ids for each job id
        self._container_for_job = {}

        # Stores container id of killed containers
        self._killed_containers = set()
Example #3
0
class SimpleAgent(object):
    """
        A simple agent that can only handle one request at a time. It should not be used directly.
        The field self.image_aliases should be filled by subclasses
    """
    logger = logging.getLogger("agent")

    def __init__(self,
                 task_directory,
                 course_factory,
                 task_factory,
                 ssh_manager_location,
                 tmp_dir="./agent_tmp"):
        """
        Initialise the agent: reset the temporary directory, optionally create the remote SSH manager and start the
        cgroup timeout/memory watchers.

        :param task_directory: stored as-is in self.task_directory
        :param course_factory: stored as-is in self.course_factory
        :param task_factory: stored as-is in self.task_factory
        :param ssh_manager_location: port or filename(unix socket) to bind to. If None, remote debugging is deactivated
        :param tmp_dir: directory that is wiped and recreated at startup, stored in self.tmp_dir
        :return:
        """
        # Imported here rather than at module level, as in the original code
        from inginious.backend.agent._cgroup_helper import CGroupTimeoutWatcher, CGroupMemoryWatcher

        self.logger.info("Starting agent")
        self.image_aliases = []
        self.tmp_dir = tmp_dir
        self.task_directory = task_directory
        self.course_factory = course_factory
        self.task_factory = task_factory

        # Delete tmp_dir, and recreate it. Both steps are best-effort: the directory may not exist yet, or may
        # contain files that cannot be removed from here. KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            rmtree(tmp_dir)
        except Exception as err:
            self.logger.debug("Cannot remove %s: %s", tmp_dir, err)

        try:
            os.mkdir(tmp_dir)
        except OSError:
            # Typically the directory still exists because the removal above failed
            pass

        # Assert that the folders are *really* empty
        self._force_directory_empty(tmp_dir)

        if ssh_manager_location is not None:
            self.remote_ssh_manager = RemoteSSHManager(ssh_manager_location)
        else:
            self.remote_ssh_manager = None

        self.logger.debug("Start cgroup helper")
        self._timeout_watcher = CGroupTimeoutWatcher()
        self._memory_watcher = CGroupMemoryWatcher()
        self._timeout_watcher.start()
        self._memory_watcher.start()

        # Init the internal job count, used to name the directories
        self._internal_job_count_lock = threading.Lock()
        self._internal_job_count = 0

        # Dict that stores running container ids for each job id
        self._container_for_job = {}

        # Stores container id of killed containers
        self._killed_containers = set()

    def _force_directory_empty(self, directory):
        """
        Empty a directory from inside a throw-away Docker container, so that entries still owned by old containers
        can be deleted.

        A "centos" container is created with networking disabled and `directory` bound read-write on /todel; it
        runs `rm -Rf /todel/*`. The call blocks until that container has exited, then schedules its removal in a
        background thread.

        :param directory: path of the directory to empty (converted to an absolute path for the bind)
        """
        client = docker.Client(**kwargs_from_env())

        created = client.create_container(
            "centos",
            volumes={'/todel': {}},
            network_disabled=True,
            command="/bin/bash -c 'rm -Rf /todel/*'")
        cleaner_id = created["Id"]

        # Host directory -> mount point inside the cleaning container
        host_binds = {os.path.abspath(directory): {'ro': False, 'bind': '/todel'}}
        client.start(cleaner_id, binds=host_binds)
        client.wait(cleaner_id)

        # The removal itself does not need to be waited for
        removal_args = (cleaner_id, True, False, True)
        thread.start_new_thread(client.remove_container, removal_args)

    def _get_new_internal_job_id(self):
        """ Get a new internal job id """
        self._internal_job_count_lock.acquire()
        internal_job_id = self._internal_job_count
        self._internal_job_count += 1
        self._internal_job_count_lock.release()
        return internal_job_id

    def handle_get_batch_container_metadata(self,
                                            container_name,
                                            docker_connection=None):
        """
            Returns the arguments needed by a particular batch container and its description.
            Everything is read from the labels of the image ("org.inginious.batch*").

            :param container_name: name of the image to inspect
            :param docker_connection: Docker client to use. If it is None (or falsy), a new one is created.
            :returns: (None, None, None) if the image cannot be inspected or does not carry the
                "org.inginious.batch" label. Else, a tuple, in the form
                ("container title",
                 "container description in restructuredtext",
                 {"key":
                    {
                     "type": "file", #or "text",
                     "path": "path/to/file/inside/input/dir", #not mandatory in file, by default "key"
                     "name": "name of the field", #not mandatory in file, default "key"
                     "description": "a short description of what this field is used for", #not mandatory, default ""
                     "custom_key1": "custom_value1",
                     ...
                    }
                 }
                )
        """

        try:
            docker_connection = docker_connection or docker.Client(
                **kwargs_from_env())
            data = docker_connection.inspect_image(
                container_name)["ContainerConfig"]["Labels"]
        except Exception:
            self.logger.warning("Cannot inspect container %s", container_name)
            return None, None, None

        # The labels entry can be None when the image has no label at all; treat it as "no labels"
        data = data or {}

        if "org.inginious.batch" not in data:
            self.logger.warning("Container %s is not a batch container",
                                container_name)
            return None, None, None

        title = data.get("org.inginious.batch.title", container_name)
        description = data.get("org.inginious.batch.description", "")

        # Find valid keys: "org.inginious.batch.args.<key>" whose value is a known type
        args = {}
        for label, value in data.items():
            match = re.match(
                r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)$", label)
            if match and value in ["file", "text"]:
                args[match.group(1)] = {"type": value}

        # Parse additional metadata for the keys: "org.inginious.batch.args.<key>.<field>"
        for label, value in data.items():
            match = re.match(
                r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)\.([a-zA-Z0-9\-_]+)$",
                label)
            if not match or match.group(1) not in args:
                continue
            key, field = match.group(1), match.group(2)
            if field == "path":
                # Only relative paths without ".." are accepted. An absolute path would make
                # os.path.join() ignore the input directory it is later joined to.
                # An invalid path is ignored and the default ("key") is used instead.
                if re.match(r"^[a-zA-Z\-_\./]+$", value) \
                        and ".." not in value and not value.startswith("/"):
                    args[key]["path"] = value
            else:
                # "name", "description" and any custom field are copied verbatim
                args[key][field] = value

        # Fill in the defaults of the fields that were not given
        for key, metadata in args.items():
            metadata.setdefault("name", key)
            metadata.setdefault("path", key)
            metadata.setdefault("description", "")

        return (title, description, args)

    def handle_batch_job(self, job_id, container_name, input_data):
        """ Creates, executes and returns the results of a batch job.
            The return value of a batch job is always a compressed(gz) tar file.
            Once the container has been started, the container and the job directory are cleaned up in every case
            (best-effort) before returning.
        :param job_id: The distant job id
        :param container_name: The container image to launch
        :param input_data: a dict containing all the keys of get_batch_container_metadata(container_name)[2].
            The values associated are file-like objects for "file" types and  strings for "text" types.
        :return: a dict, containing either:
            - {"retval":0, "stdout": "...", "stderr":"...", "file":"..."}
                if everything went well. (where file is a tgz file containing the content of the /output folder from the container)
            - {"retval":"...", "stdout": "...", "stderr":"..."}
                if the container crashed (retval is an int != 0) (can also contain file, but not mandatory)
            - {"retval":-1, "stderr": "the error message"}
                if the container failed to start
        """
        self.logger.info("Received request for jobid %s (batch job)", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return {'retval': -1, "stderr": "Failed to connect to Docker"}

        batch_args = self.handle_get_batch_container_metadata(
            container_name, docker_connection)[2]
        if batch_args is None:
            return {
                'retval': -1,
                "stderr": "Inspecting the batch container image failed"
            }

        container_path = os.path.join(self.tmp_dir,
                                      str(internal_job_id))  # tmp_dir/id/
        input_path = os.path.join(container_path, 'input')  # tmp_dir/id/input/
        output_path = os.path.join(container_path,
                                   'output')  # tmp_dir/id/output/

        # Remove a possibly existing older folder; most of the time there is nothing to remove
        try:
            rmtree(container_path)
        except Exception:
            pass

        job_directories = (container_path, input_path, output_path)
        for directory in job_directories:
            os.mkdir(directory)
        for directory in job_directories:
            os.chmod(directory, 0o777)

        # Write the input data inside tmp_dir/id/input/
        try:
            if set(input_data.keys()) != set(batch_args.keys()):
                raise Exception("Invalid keys for inputdata")

            input_root = os.path.abspath(input_path)
            for key in batch_args:
                value = input_data[key]
                if batch_args[key]["type"] == "text":
                    if not isinstance(value, basestring):
                        raise Exception(
                            "Invalid value for inputdata: the value for key {} should be a string"
                            .format(key))
                    content = value
                elif batch_args[key]["type"] == "file":
                    if isinstance(value, basestring):
                        raise Exception(
                            "Invalid value for inputdata: the value for key {} should be a file object"
                            .format(key))
                    content = value.read()
                else:
                    # Unknown type: nothing is written, as before
                    continue

                # Never write outside of the input directory, whatever the metadata says
                destination = os.path.abspath(
                    os.path.join(input_path, batch_args[key]["path"]))
                if not destination.startswith(input_root + os.sep):
                    raise Exception(
                        "Invalid path for key {}".format(key))

                # The path may contain sub-directories, which do not exist yet
                parent_directory = os.path.dirname(destination)
                if not os.path.isdir(parent_directory):
                    os.makedirs(parent_directory)

                with open(destination, 'w') as input_file:
                    input_file.write(content)
        except Exception:
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Invalid tgz for input'}

        # Run the container
        container_id = None
        try:
            response = docker_connection.create_container(container_name,
                                                          volumes={
                                                              '/input': {},
                                                              '/output': {}
                                                          })
            container_id = response["Id"]

            # Start the container
            docker_connection.start(container_id,
                                    binds={
                                        os.path.abspath(input_path): {
                                            'ro': False,
                                            'bind': '/input'
                                        },
                                        os.path.abspath(output_path): {
                                            'ro': False,
                                            'bind': '/output'
                                        }
                                    })
        except Exception as e:
            self.logger.warning("Cannot start container! %s", str(e))
            if container_id is not None:
                # The container was created but could not be started: do not leave it behind
                thread.start_new_thread(docker_connection.remove_container,
                                        (container_id, True, False, True))
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Cannot start container'}

        try:
            # Wait for completion
            return_value = self._wait_for_container_completion(
                docker_connection, container_id, None)

            # If docker cannot do anything...
            if return_value == -1:
                self.logger.info("Container for job id %s crashed", job_id)
                return {'retval': -1, "stderr": 'Container crashed at startup'}

            # Get logs back; stdout and stderr are fetched separately
            try:
                stdout = str(
                    docker_connection.logs(container_id, stdout=True,
                                           stderr=False))
                stderr = str(
                    docker_connection.logs(container_id, stdout=False,
                                           stderr=True))
            except Exception:
                self.logger.warning("Cannot get back stdout of container %s!",
                                    container_id)
                return {
                    'retval': -1,
                    "stderr": 'Cannot retrieve stdout/stderr from container'
                }

            # Tgz the files in /output. The archive lives in an anonymous temporary file, so it does not
            # depend on the job directory once it has been written.
            tmpfile = None
            try:
                tmpfile = tempfile.TemporaryFile()
                tar = tarfile.open(fileobj=tmpfile, mode='w:gz')
                tar.add(output_path, '/', True)
                tar.close()
                tmpfile.flush()
                tmpfile.seek(0)
            except Exception:
                if tmpfile is not None:
                    tmpfile.close()
                return {
                    'retval': -1,
                    "stderr":
                    'The agent was unable to archive the /output directory'
                }

            return {
                'retval': return_value,
                "stdout": stdout,
                "stderr": stderr,
                "file": tmpfile
            }
        finally:
            # Remove the container (in the background) and the job directory in every case
            thread.start_new_thread(docker_connection.remove_container,
                                    (container_id, True, False, True))
            try:
                rmtree(container_path)
            except Exception as e:
                self.logger.warning("Cannot remove directory %s: %s",
                                    container_path, str(e))

    def handle_job(self, job_id, course_id, task_id, inputdata, debug,
                   ssh_callback):
        """ Creates, executes and returns the results of a new job
        :param job_id: The distant job id
        :param course_id: The course id of the linked task
        :param task_id: The task id of the linked task
        :param inputdata: Input data, given by the student (dict)
        :param debug: Can be False (normal mode), True (outputs more data), or "ssh" (starts an ssh server in the container)
        :param ssh_callback: ssh callback function. Takes two parameters: (conn_id, private_key). Is only called if debug == "ssh".
        :return: a dict with at least the key "result". It is
            {"result": "crash", "text": "..."} when the agent or the container failed,
            {"result": "killed"}, {"result": "timeout"} or {"result": "overflow"} when the container was stopped,
            and otherwise the JSON object that the grading container wrote on its standard output.
        """
        self.logger.info("Received request for jobid %s", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Verify some arguments
        if debug == "ssh" and ssh_callback is None:
            return {
                'result':
                'crash',
                'text':
                'Agent error message: debug mode is set as ssh, but ssh_callback is None.'
            }
        if debug == "ssh" and self.remote_ssh_manager is None:
            return {
                'result': 'crash',
                'text': 'Remote debugging is not activated on this agent.'
            }

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return {'result': 'crash', 'text': 'Cannot connect to Docker'}

        # Get back the task data (for the limits)
        try:
            task = self.course_factory.get_task(course_id, task_id)
        except Exception:
            self.logger.warning("Task %s/%s unavailable on this agent",
                                course_id, task_id)
            return {
                'result':
                'crash',
                'text':
                'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                'persists, please contact your course administrator.'
            }

        limits = task.get_limits()

        # Memory limit defaults to 100 and is never lower than 20 (used below with the "M" suffix)
        mem_limit = limits.get("memory", 100)
        if mem_limit < 20:
            mem_limit = 20

        environment = task.get_environment()
        if environment not in self.image_aliases:
            self.logger.warning(
                "Task %s/%s ask for an unknown environment %s (not in aliases)",
                course_id, task_id, environment)
            return {
                'result':
                'crash',
                'text':
                'Unknown container. Please contact your course administrator.'
            }
        environment = self.image_aliases[environment]

        # Remove possibly existing older folder and creates the new ones
        container_path = os.path.join(self.tmp_dir,
                                      str(internal_job_id))  # tmp_dir/id/
        task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
        sockets_path = os.path.join(container_path,
                                    'sockets')  # tmp_dir/id/socket/
        student_path = os.path.join(task_path,
                                    'student')  # tmp_dir/id/task/student/
        try:
            rmtree(container_path)
        except Exception:
            # Most of the time there is simply nothing to remove
            pass

        os.mkdir(container_path)
        os.mkdir(sockets_path)
        os.chmod(container_path, 0o777)
        os.chmod(sockets_path, 0o777)

        copytree(
            os.path.join(self.task_directory, task.get_course_id(),
                         task.get_id()), task_path)
        os.chmod(task_path, 0o777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0o777)

        # Run the container
        try:
            # Network is only available when the task asks for it, or in ssh debug mode
            network_allowed = task.allow_network_access_grading() or debug == "ssh"

            response = docker_connection.create_container(
                environment,
                stdin_open=True,
                volumes={
                    '/task': {},
                    '/sockets': {}
                },
                network_disabled=not network_allowed,
                host_config=docker_connection.create_host_config(
                    mem_limit=str(mem_limit) + "M",
                    memswap_limit=str(mem_limit) + "M",
                    mem_swappiness=0,
                    oom_kill_disable=True,
                    network_mode=("bridge" if network_allowed else 'none'),
                    binds={
                        os.path.abspath(task_path): {
                            'ro': False,
                            'bind': '/task'
                        },
                        os.path.abspath(sockets_path): {
                            'ro': False,
                            'bind': '/sockets'
                        }
                    }))
            container_id = response["Id"]
            self._container_for_job[job_id] = container_id

            # Start the RPyC server associated with this container
            container_set = set()
            student_container_management_service = self._get_agent_student_container_service(
                container_id, container_set, student_path,
                task.get_environment(), limits.get("time", 30), mem_limit)

            # Small workaround for error "AF_UNIX path too long" when the agent is launched inside a container. Resolve all symlinks to reduce the
            # path length.
            smaller_path_to_socket = os.path.realpath(
                os.path.join(sockets_path, 'INGInious.sock'))

            student_container_management = UnixSocketServer(
                student_container_management_service,
                socket_path=smaller_path_to_socket,
                protocol_config={
                    "allow_public_attrs": True,
                    'allow_pickle': True
                })

            student_container_management_thread = threading.Thread(
                target=student_container_management.start)
            student_container_management_thread.daemon = True
            student_container_management_thread.start()

            # Start the container
            docker_connection.start(container_id)

            # Send the input data
            container_input = {"input": inputdata, "limits": limits}
            if debug:
                container_input["debug"] = debug
            self.logger.debug("%s", json.dumps(container_input))
            docker_connection.attach_socket(container_id, {
                'stdin': 1,
                'stream': 1
            }).send(json.dumps(container_input) + "\n")
        except Exception as e:
            self.logger.warning("Cannot start container! %s", str(e))
            rmtree(container_path)

            # The entry only exists if the container could be created
            self._container_for_job.pop(job_id, None)

            return {'result': 'crash', 'text': 'Cannot start container'}

        # Ask the "cgroup" thread to verify the timeout/memory limit
        time_limit = limits.get("time", 30)
        hard_time_limit = limits.get('hard_time', time_limit * 3)
        if debug == "ssh":  # allow 30 minutes of real time.
            time_limit = 30 * 60
            hard_time_limit = 30 * 60
        self._timeout_watcher.add_container_timeout(container_id, time_limit,
                                                    hard_time_limit)
        self._memory_watcher.add_container_memory_limit(
            container_id, mem_limit)

        # If ssh mode is activated, get the ssh key
        if debug == "ssh":
            self._handle_container_ssh_start(docker_connection, container_id,
                                             job_id, ssh_callback)

        # Wait for completion
        error_occured = False
        if self._wait_for_container_completion(docker_connection, container_id,
                                               int(hard_time_limit *
                                                   1.2)) == -1:
            self.logger.info("Container for job id %s crashed", job_id)
            error_occured = True

        if debug == "ssh":
            self._handle_container_ssh_close(job_id)

        del self._container_for_job[job_id]

        # Verify that everything went well. All three checks are always made, whatever their outcome.
        error_killed = self._container_was_killed(container_id)
        error_timeout = self._timeout_watcher.container_had_error(container_id)
        error_memory = self._memory_watcher.container_had_error(container_id)
        if error_killed:
            result = {"result": "killed"}
        elif error_timeout:
            result = {"result": "timeout"}
        elif error_memory:
            result = {"result": "overflow"}
        elif error_occured:
            result = {
                "result": "crash",
                "text": "An unknown error occurred while running the container"
            }
        else:
            # Get logs back
            try:
                stdout = str(
                    docker_connection.logs(container_id,
                                           stdout=True,
                                           stderr=False))
                self.logger.debug(stdout)
                if debug == "ssh":  # skip the first line of the output, that contained the ssh key
                    stdout = "\n".join(stdout.split("\n")[1:])
                self.logger.debug(stdout)
                result = json.loads(stdout)
            except Exception as e:
                self.logger.warning(
                    "Cannot get back stdout of container %s! (%s)",
                    container_id, str(e))
                result = {
                    'result': 'crash',
                    'text': 'The grader did not return a readable output'
                }

        # Close RPyC server
        student_container_management.close()

        # Remove container
        thread.start_new_thread(docker_connection.remove_container,
                                (container_id, True, False, True))

        # Remove subcontainers. A copy of the set is iterated, as the set is shared with the RPyC service.
        for student_container_id in list(container_set):
            # Also deletes them from the timeout/memory watchers: the id of the subcontainer itself must be
            # given here, not the id of the grading container
            self._timeout_watcher.container_had_error(student_container_id)
            self._memory_watcher.container_had_error(student_container_id)
            thread.start_new_thread(docker_connection.remove_container,
                                    (student_container_id, True, False, True))

        # Delete folders
        rmtree(container_path)

        # Return!
        return result

    def _create_new_student_container(self, container_name, working_dir,
                                      command, memory_limit, time_limit,
                                      hard_time_limit, share_network,
                                      parent_container_id, container_set,
                                      student_path):
        """
        Create and start a student container, and register it in the timeout/memory watchers.

        :param container_name: alias of the image to run; must be in self.image_aliases
        :param working_dir: working directory of the command inside the container
        :param command: command to run in the container
        :param memory_limit: memory limit (used with the "M" suffix). A falsy value means 100; the minimum is 20.
        :param time_limit: time limit given to the timeout watcher
        :param hard_time_limit: hard time limit; the watcher receives min(time_limit * 4, hard_time_limit)
        :param share_network: if True, the container uses the network of parent_container_id; else it has no network
        :param parent_container_id: id of the container whose network may be shared
        :param container_set: set to which the id of the new container is added
        :param student_path: host directory bound read-write on /task/student
        :return: a tuple (container_id, socket attached to stdout/stderr, None) on success,
            or (None, None, "error message") on failure
        """
        self.logger.debug(
            "Starting new student container... %s %s %s %s %s %s",
            container_name, working_dir, command, memory_limit, time_limit,
            hard_time_limit)
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return None, None, "Cannot connect to Docker!"

        mem_limit = max(memory_limit or 100, 20)

        if container_name not in self.image_aliases:
            self.logger.info("Unknown environment %s (not in aliases)",
                             container_name)
            return None, None, "Unknown environment {} (not in aliases)".format(
                container_name)
        environment = self.image_aliases[container_name]

        container_id = None
        try:
            response = docker_connection.create_container(
                environment,
                stdin_open=True,
                network_disabled=(not share_network),
                volumes={'/task/student': {}},
                command=command,
                working_dir=working_dir,
                user="******",
                host_config=docker_connection.create_host_config(
                    mem_limit=str(mem_limit) + "M",
                    memswap_limit=str(mem_limit) + "M",
                    mem_swappiness=0,
                    oom_kill_disable=True,
                    network_mode=('none' if not share_network else
                                  ('container:' + parent_container_id)),
                    binds={
                        os.path.abspath(student_path): {
                            'ro': False,
                            'bind': '/task/student'
                        }
                    }))
            container_id = response["Id"]

            # Start the container
            docker_connection.start(container_id)

            stdout_err = docker_connection.attach_socket(
                container_id, {
                    'stdin': 0,
                    'stdout': 1,
                    'stderr': 1,
                    'stream': 1,
                    'logs': 1
                })
        except Exception as e:
            self.logger.warning("Cannot start container! %s", e)
            if container_id is not None:
                # The container exists but is not tracked in container_set yet: remove it here, else
                # nothing would ever remove it
                thread.start_new_thread(docker_connection.remove_container,
                                        (container_id, True, False, True))
            return None, None, "Cannot start container! {}".format(e)

        container_set.add(container_id)
        # Ask the "cgroup" thread to verify the timeout/memory limit
        self._timeout_watcher.add_container_timeout(
            container_id, time_limit, min(time_limit * 4, hard_time_limit))
        self._memory_watcher.add_container_memory_limit(
            container_id, mem_limit)

        self.logger.info("New student container started")
        return container_id, stdout_err, None

    def _student_container_signal(self, container_id, signalnum):
        """
        Send a signal to a student container.

        :param container_id: id of the container to signal
        :param signalnum: signal, given as-is to the kill method of the Docker client
        :return: False if the connection to Docker cannot be created, True once the signal has been sent.
            Errors raised while sending the signal are not caught here.
        """
        self.logger.info("Sending signal %s to student container",
                         str(signalnum))
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return False

        docker_connection.kill(container_id, signalnum)
        return True

    def _student_container_get_stdin(self, container_id):
        """
        Get a socket attached to the standard input of a student container.

        :param container_id: id of the container
        :return: None if the connection to Docker cannot be created, else the socket returned by the Docker
            client, attached to stdin only (stdout and stderr are not attached). Errors raised while attaching
            are not caught here.
        """
        self.logger.info("Getting stdin of student container")
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return None

        stdin = docker_connection.attach_socket(container_id, {
            'stdin': 1,
            'stderr': 0,
            'stdout': 0,
            'stream': 1
        })
        self.logger.info("Returning stdin of student container")
        return stdin

    def _student_container_close(self, container_id, container_set):
        """
        Wait for a student container to end, then remove it.

        :param container_id: id of the student container
        :param container_set: set from which the id of the container is removed
        :return: the return value of the container, or
            254 if Docker is unreachable or the wait for the container returned -1,
            253 if the timeout watcher reports an error for this container,
            252 if the memory watcher reports an error for this container (takes precedence over 253)
        """
        self.logger.info("Closing student container")

        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return 254

        # Wait for completion
        return_value = self._wait_for_container_completion(
            docker_connection, container_id, None)
        if return_value == -1:
            return_value = 254

        # Verify that everything went well. Both watchers are always queried.
        if self._timeout_watcher.container_had_error(container_id):
            return_value = 253
        if self._memory_watcher.container_had_error(container_id):
            return_value = 252

        # Remove container
        thread.start_new_thread(docker_connection.remove_container,
                                (container_id, True, False, True))
        # discard() rather than remove(): the id may already be gone from the set (e.g. container closed
        # twice), which must not turn into a KeyError
        container_set.discard(container_id)

        # Return!
        return return_value

    def _get_agent_student_container_service(self, parent_container_id,
                                             container_set, student_path,
                                             default_container, default_time,
                                             default_memory):
        """ Build the RPyC service class that manages the student containers of one grading container.

        :param parent_container_id: id of the grading container, forwarded when a student container is created
        :param container_set: set of student container ids; signal/stdin/close only act on ids found in it
        :param student_path: forwarded when a student container is created
        :param default_container: container name used when exposed_run receives ""
        :param default_time: time limit used when exposed_run receives 0
        :param default_memory: memory limit used when exposed_run receives 0
        :return: the service class (not an instance)
        """
        # Inside the service class "self" is the service, so the agent's bound methods are captured here
        start_container = self._create_new_student_container
        send_signal = self._student_container_signal
        open_stdin = self._student_container_get_stdin
        close_container = self._student_container_close

        class StudentContainerManagementService(rpyc.Service):
            def exposed_run(self,
                            container_name,
                            working_dir,
                            command,
                            memory_limit=0,
                            time_limit=0,
                            hard_time_limit=0,
                            share_network=False):
                # "" and 0 mean "use the default"
                chosen_name = default_container if container_name == "" else container_name
                chosen_memory = default_memory if memory_limit == 0 else memory_limit
                chosen_time = default_time if time_limit == 0 else time_limit
                chosen_hard_time = 3 * chosen_time if hard_time_limit == 0 else hard_time_limit
                return start_container(str(chosen_name), str(working_dir), str(command),
                                       int(chosen_memory), int(chosen_time), int(chosen_hard_time),
                                       bool(share_network), parent_container_id, container_set, student_path)

            def exposed_signal(self, container_id, signalnum):
                if container_id not in container_set:
                    return None
                return send_signal(str(container_id), int(signalnum))

            def exposed_stdin(self, container_id):
                if container_id not in container_set:
                    return None
                return open_stdin(str(container_id))

            def exposed_close(self, container_id):
                if container_id not in container_set:
                    return None
                return close_container(str(container_id), container_set)

        return StudentContainerManagementService

    def _wait_for_container_completion(self, docker_connection, container_id,
                                       max_time):
        """ Wait for container completion. Returns the return value of the command or -1 if an error happened. """
        try:
            return_value = docker_connection.wait(container_id, max_time)
        except:
            return_value = -1
        self.logger.debug("Container id %s ended with %i", container_id,
                          return_value)
        return return_value

    def _handle_container_ssh_start(self, docker_connection, container_id,
                                    job_id, ssh_callback):
        """ Handle the creation of the distant SSH server """
        if self.remote_ssh_manager is None:
            raise Exception("Remote debugging is not activated")

        ssh_key = None
        for attempt in range(0, 30):
            try:
                stdout = str(
                    docker_connection.logs(container_id,
                                           stdout=True,
                                           stderr=False))
                stdout = stdout.split("\n")[0]
                stdout = json.loads(stdout)
                ssh_key = stdout["ssh_key"]
                self.logger.debug('Got SSH key for job %s', job_id)
                break
            except Exception as e:
                self.logger.debug("Cannot get SSH for job %s: %s. Retrying...",
                                  job_id, str(e))
            time.sleep(1)

        if ssh_key is None:
            return

        try:
            ip = docker_connection.inspect_container(
                container_id)["NetworkSettings"]["IPAddress"]
        except:
            self.logger.debug(
                "Cannot inspect container to find IP, for job %s", job_id)
            return

        self.logger.debug("Got IP for SSH connection of job id %s: %s", job_id,
                          ip)
        self.remote_ssh_manager.add_open_connection(job_id, ip, 22)
        self.logger.debug(
            "Start key for the inginious-remote-debug command:\n%s\n%s",
            job_id, ssh_key)

        if ssh_callback is not None:
            try:
                ssh_callback(job_id, ssh_key)
            except:
                self.logger.warning("Cannot call ssh_callback for job id %s",
                                    job_id)

    def _handle_container_ssh_close(self, job_id):
        """ Marks as closed a distant SSH server """
        if self.remote_ssh_manager is None:
            raise Exception("Remote debugging is not activated")

        self.remote_ssh_manager.del_connection(job_id)

    def _container_was_killed(self, container_id):
        """ Returns True (/False) if the container was (/not) killed by the remote job manager. Can only be called ONCE for each container_id """
        if container_id in self._killed_containers:
            self._killed_containers.remove(container_id)
            return True
        return False

    def kill_job(self, job_id):
        """ Kill the container of a running job.

        :param job_id: a job id known to self._container_for_job
        :return: True if Docker accepted the kill request, False if the job id is unknown, Docker is unreachable or
            the kill call raised
        """
        target = self._container_for_job.get(job_id)
        if target is None:
            self.logger.error("Invalid job_id submitted to kill_job")
            return False

        # Marked as killed before the attempt, whether or not the kill succeeds
        self._killed_containers.add(target)

        try:
            client = docker.Client(**kwargs_from_env())
        except:
            self.logger.error("kill_job cannot connect to Docker!")
            return False

        try:
            client.kill(target)
        except Exception as err:
            self.logger.info("Cannot kill container %s: %s", target,
                             str(err))
            return False
        return True
Example #4
0
class SimpleAgent(object):
    """
        A simple agent that can only handle one request at a time. It should not be used directly.
        The field self.image_aliases should be filled by subclasses
    """
    logger = logging.getLogger("agent")

    def __init__(self, task_directory, course_factory, task_factory, ssh_manager_location, tmp_dir="./agent_tmp"):
        """
        Prepare the temporary directory, the optional remote SSH manager and the cgroup watchers.

        :param task_directory: root directory of the tasks; handle_job copies <task_directory>/<course id>/<task id>
        :param course_factory: used by handle_job to load tasks
        :param task_factory: stored on the agent
        :param ssh_manager_location: port or filename(unix socket) to bind to. If None, remote debugging is deactivated
        :param tmp_dir: working directory of the agent; it is wiped and recreated here
        """
        from inginious.backend.agent._cgroup_helper import CGroupTimeoutWatcher, CGroupMemoryWatcher

        self.logger.info("Starting agent")
        self.image_aliases = []
        self.tmp_dir = tmp_dir
        self.task_directory = task_directory
        self.course_factory = course_factory
        self.task_factory = task_factory

        # Best effort: wipe tmp_dir if it exists...
        try:
            rmtree(tmp_dir)
        except:
            pass

        # ...and make sure it exists afterwards
        try:
            os.mkdir(tmp_dir)
        except OSError:
            pass

        # Whatever survived the rmtree above is removed through Docker
        self._force_directory_empty(tmp_dir)

        # No location means no remote debugging
        self.remote_ssh_manager = None if ssh_manager_location is None else RemoteSSHManager(ssh_manager_location)

        self.logger.debug("Start cgroup helper")
        timeout_watcher = CGroupTimeoutWatcher()
        memory_watcher = CGroupMemoryWatcher()
        self._timeout_watcher = timeout_watcher
        self._memory_watcher = memory_watcher
        timeout_watcher.start()
        memory_watcher.start()

        # Counter (and its lock) used to name the per-job directories
        self._internal_job_count_lock = threading.Lock()
        self._internal_job_count = 0

        # job id -> id of its running container
        self._container_for_job = {}

        # ids of the containers kill_job was asked to kill
        self._killed_containers = set()

    def _force_directory_empty(self, directory):
        """ Empty a directory from inside a Docker container, so that files still owned by old containers can be removed.

        A "centos" container with networking disabled gets the directory mounted on /todel and runs
        "rm -Rf /todel/*". The call waits for that container to end; its removal runs in a separate thread.
        """
        client = docker.Client(**kwargs_from_env())
        created = client.create_container(
            "centos",
            volumes={'/todel': {}},
            network_disabled=True,
            command="/bin/bash -c 'rm -Rf /todel/*'"
        )
        cleaner_id = created["Id"]

        host_dir = os.path.abspath(directory)
        client.start(cleaner_id, binds={host_dir: {'ro': False, 'bind': '/todel'}})
        client.wait(cleaner_id)

        removal_args = (cleaner_id, True, False, True)
        thread.start_new_thread(client.remove_container, removal_args)

    def _get_new_internal_job_id(self):
        """ Get a new internal job id """
        self._internal_job_count_lock.acquire()
        internal_job_id = self._internal_job_count
        self._internal_job_count += 1
        self._internal_job_count_lock.release()
        return internal_job_id

    def handle_get_batch_container_metadata(self, container_name, docker_connection=None):
        """
            Returns the arguments needed by a particular batch container and its description
            :returns: a tuple, in the form
                ("container title",
                 "container description in restructuredtext",
                 {"key":
                    {
                     "type:" "file", #or "text",
                     "path": "path/to/file/inside/input/dir", #not mandatory in file, by default "key"
                     "name": "name of the field", #not mandatory in file, default "key"
                     "description": "a short description of what this field is used for", #not mandatory, default ""
                     "custom_key1": "custom_value1",
                     ...
                    }
                 }
                )
        """

        try:
            docker_connection = docker_connection or docker.Client(**kwargs_from_env())
            data = docker_connection.inspect_image(container_name)["ContainerConfig"]["Labels"]
        except:
            self.logger.warning("Cannot inspect container %s", container_name)
            return None, None, None

        if not "org.inginious.batch" in data:
            self.logger.warning("Container %s is not a batch container", container_name)
            return None, None, None

        title = data["org.inginious.batch.title"] if "org.inginious.batch.title" in data else container_name
        description = data["org.inginious.batch.description"] if "org.inginious.batch.description" in data else ""

        # Find valids keys
        args = {}
        for label in data:
            match = re.match(r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)$", label)
            if match and data[label] in ["file", "text"]:
                args[match.group(1)] = {"type": data[label]}

        # Parse additional metadata for the keys
        for label in data:
            match = re.match(r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)\.([a-zA-Z0-9\-_]+)$", label)
            if match and match.group(1) in args:
                if match.group(2) in ["name", "description"]:
                    args[match.group(1)][match.group(2)] = data[label]
                elif match.group(2) == "path":
                    if re.match(r"^[a-zA-Z\-_\./]+$", data[label]) and ".." not in data[label]:
                        args[match.group(1)]["path"] = data[label]
                else:
                    args[match.group(1)][match.group(2)] = data[label]

        # Add all the unknown metadata
        for key in args:
            if "name" not in args[key]:
                args[key]["name"] = key
            if "path" not in args[key]:
                args[key]["path"] = key
            if "description" not in args[key]:
                args[key]["description"] = ""

        return (title, description, args)

    def handle_batch_job(self, job_id, container_name, input_data):
        """ Creates, executes and returns the results of a batch job.
            The return value of a batch job is always a compressed(gz) tar file.
        :param job_id: The distant job id
        :param container_name: The container image to launch
        :param input_data: a dict containing all the keys of get_batch_container_metadata(container_name)[2].
            The values associated are file-like objects for "file" types and  strings for "text" types.
        :return: a dict, containing either:
            - {"retval":0, "stdout": "...", "stderr":"...", "file":"..."}
                if everything went well. (where file is a tgz file containing the content of the /output folder from the container)
            - {"retval":"...", "stdout": "...", "stderr":"..."}
                if the container crashed (retval is an int != 0) (can also contain file, but not mandatory)
            - {"retval":-1, "stderr": "the error message"}
                if the container failed to start
        """
        self.logger.info("Received request for jobid %s (batch job)", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return {'retval': -1, "stderr": "Failed to connect to Docker"}

        batch_args = self.handle_get_batch_container_metadata(container_name, docker_connection)[2]
        if batch_args is None:
            return {'retval': -1, "stderr": "Inspecting the batch container image failed"}

        container_path = os.path.join(self.tmp_dir, str(internal_job_id))  # tmp_dir/id/
        input_path = os.path.join(container_path, 'input')  # tmp_dir/id/input/
        output_path = os.path.join(container_path, 'output')  # tmp_dir/id/output/
        try:
            rmtree(container_path)
        except Exception:
            # Best effort: the directory is not expected to exist yet
            pass

        os.mkdir(container_path)
        os.mkdir(input_path)
        os.mkdir(output_path)
        os.chmod(container_path, 0o777)
        os.chmod(input_path, 0o777)
        os.chmod(output_path, 0o777)

        # Write each input value to the file the container expects it in
        try:
            if set(input_data.keys()) != set(batch_args.keys()):
                raise Exception("Invalid keys for inputdata")

            for key in batch_args:
                arg_type = batch_args[key]["type"]
                target = os.path.join(input_path, batch_args[key]["path"])
                if arg_type == "text":
                    if not isinstance(input_data[key], basestring):
                        raise Exception("Invalid value for inputdata: the value for key {} should be a string".format(key))
                    with open(target, 'w') as input_file:
                        input_file.write(input_data[key])
                elif arg_type == "file":
                    if isinstance(input_data[key], basestring):
                        raise Exception("Invalid value for inputdata: the value for key {} should be a file object".format(key))
                    with open(target, 'wb') as input_file:
                        input_file.write(input_data[key].read())
        except Exception:
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Invalid tgz for input'}

        # Run the container
        try:
            response = docker_connection.create_container(
                container_name,
                volumes={'/input': {}, '/output': {}}
            )
            container_id = response["Id"]

            # Start the container
            docker_connection.start(container_id,
                                    binds={os.path.abspath(input_path): {'ro': False, 'bind': '/input'},
                                           os.path.abspath(output_path): {'ro': False, 'bind': '/output'}})
        except Exception as e:
            self.logger.warning("Cannot start container! %s", str(e))
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Cannot start container'}

        # Wait for completion
        return_value = self._wait_for_container_completion(docker_connection, container_id, None)

        # If docker cannot do anything...
        if return_value == -1:
            self.logger.info("Container for job id %s crashed", job_id)
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Container crashed at startup'}

        # Get logs back; stdout and stderr are fetched separately
        stdout = ""
        stderr = ""
        try:
            stdout = str(docker_connection.logs(container_id, stdout=True, stderr=False))
            stderr = str(docker_connection.logs(container_id, stdout=False, stderr=True))
        except Exception:
            self.logger.warning("Cannot get back stdout of container %s!", container_id)
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Cannot retrieve stdout/stderr from container'}

        # Tgz the files in /output
        try:
            tmpfile = tempfile.TemporaryFile()
            tar = tarfile.open(fileobj=tmpfile, mode='w:gz')
            tar.add(output_path, '/', True)
            tar.close()
            tmpfile.flush()
            tmpfile.seek(0)
        except Exception:
            rmtree(container_path)
            return {'retval': -1, "stderr": 'The agent was unable to archive the /output directory'}

        # NOTE(review): on this success path neither container_path nor the container itself is removed - confirm
        # whether that is intended
        return {'retval': return_value, "stdout": stdout, "stderr": stderr, "file": tmpfile}

    def handle_job(self, job_id, course_id, task_id, inputdata, debug, ssh_callback):
        """ Creates, executes and returns the results of a new job
        :param job_id: The distant job id
        :param course_id: The course id of the linked task
        :param task_id: The task id of the linked task
        :param inputdata: Input data, given by the student (dict)
        :param debug: Can be False (normal mode), True (outputs more data), or "ssh" (starts an ssh server in the container)
        :param ssh_callback: ssh callback function. Takes two parameters: (conn_id, private_key). Is only called if debug == "ssh".
        :return: a dict. When the agent detects a problem, its "result" key is "crash" (with a "text" explanation), "killed",
            "timeout" or "overflow"; otherwise the dict is the JSON document the container printed on its stdout.
        """
        self.logger.info("Received request for jobid %s", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Verify some arguments
        if debug == "ssh" and ssh_callback is None:
            return {'result': 'crash', 'text': 'Agent error message: debug mode is set as ssh, but ssh_callback is None.'}
        if debug == "ssh" and self.remote_ssh_manager is None:
            return {'result': 'crash', 'text': 'Remote debugging is not activated on this agent.'}

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except:
            self.logger.warning("Cannot connect to Docker!")
            return {'result': 'crash', 'text': 'Cannot connect to Docker'}

        # Get back the task data (for the limits)
        try:
            task = self.course_factory.get_task(course_id, task_id)
        except:
            self.logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
            return {'result': 'crash', 'text': 'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                               'persists, please contact your course administrator.'}

        limits = task.get_limits()

        # Default of 100, never less than 20
        mem_limit = limits.get("memory", 100)
        if mem_limit < 20:
            mem_limit = 20

        environment = task.get_environment()
        if environment not in self.image_aliases:
            self.logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", course_id, task_id, environment)
            return {'result': 'crash', 'text': 'Unknown container. Please contact your course administrator.'}
        environment = self.image_aliases[environment]

        # Remove possibly existing older folder and creates the new ones
        container_path = os.path.join(self.tmp_dir, str(internal_job_id))  # tmp_dir/id/
        task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
        sockets_path = os.path.join(container_path, 'sockets')  # tmp_dir/id/socket/
        student_path = os.path.join(task_path, 'student')  # tmp_dir/id/task/student/
        try:
            rmtree(container_path)
        except:
            pass

        os.mkdir(container_path)
        os.mkdir(sockets_path)
        os.chmod(container_path, 0o777)
        os.chmod(sockets_path, 0o777)

        copytree(os.path.join(self.task_directory, task.get_course_id(), task.get_id()), task_path)
        os.chmod(task_path, 0o777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0o777)

        # Run the container
        try:
            response = docker_connection.create_container(
                environment,
                stdin_open=True,
                volumes={'/task': {}, '/sockets': {}},
                network_disabled=not (task.allow_network_access_grading() or debug == "ssh")
            )
            container_id = response["Id"]
            self._container_for_job[job_id] = container_id

            # Start the RPyC server associated with this container
            container_set = set()
            student_container_management_service = self._get_agent_student_container_service(
                container_id,
                container_set,
                student_path,
                task.get_environment(),
                limits.get("time", 30),
                mem_limit)

            # Small workaround for error "AF_UNIX path too long" when the agent is launched inside a container. Resolve all symlinks to reduce the
            # path length.
            smaller_path_to_socket = os.path.realpath(os.path.join(sockets_path, 'INGInious.sock'))

            student_container_management = UnixSocketServer(
                student_container_management_service,
                socket_path=smaller_path_to_socket,
                protocol_config={"allow_public_attrs": True, 'allow_pickle': True})

            student_container_management_thread = threading.Thread(target=student_container_management.start)
            student_container_management_thread.daemon = True
            student_container_management_thread.start()

            # Start the container
            docker_connection.start(container_id,
                                    binds={os.path.abspath(task_path): {'ro': False, 'bind': '/task'},
                                           os.path.abspath(sockets_path): {'ro': False, 'bind': '/sockets'}},
                                    mem_limit=mem_limit * 1024 * 1024,
                                    memswap_limit=mem_limit * 1024 * 1024,  # disable swap
                                    oom_kill_disable=True,
                                    network_mode=("bridge" if (task.allow_network_access_grading() or debug == "ssh") else None))

            # Send the input data
            container_input = {"input": inputdata, "limits": limits}
            if debug:
                container_input["debug"] = debug
            self.logger.debug("%s", json.dumps(container_input))
            docker_connection.attach_socket(container_id, {'stdin': 1, 'stream': 1}).send(json.dumps(container_input) + "\n")
        except Exception as e:
            # NOTE(review): a container or RPyC server created before the failure is not cleaned up here - confirm
            self.logger.warning("Cannot start container! %s", str(e))
            rmtree(container_path)

            # The entry only exists if the failure happened after the container was created
            self._container_for_job.pop(job_id, None)

            return {'result': 'crash', 'text': 'Cannot start container'}

        # Ask the "cgroup" thread to verify the timeout/memory limit
        time_limit = limits.get("time", 30)
        hard_time_limit = limits.get('hard_time', time_limit * 3)
        if debug == "ssh":  # allow 30 minutes of real time.
            time_limit = 30 * 60
            hard_time_limit = 30 * 60
        self._timeout_watcher.add_container_timeout(container_id, time_limit, hard_time_limit)
        self._memory_watcher.add_container_memory_limit(container_id, mem_limit)

        # If ssh mode is activated, get the ssh key
        if debug == "ssh":
            self._handle_container_ssh_start(docker_connection, container_id, job_id, ssh_callback)

        # Wait for completion
        error_occured = False
        if self._wait_for_container_completion(docker_connection, container_id, int(hard_time_limit * 1.2)) == -1:
            self.logger.info("Container for job id %s crashed", job_id)
            error_occured = True

        if debug == "ssh":
            self._handle_container_ssh_close(job_id)

        del self._container_for_job[job_id]

        # Verify that everything went well. All three sources are queried before choosing, in priority order
        # killed > timeout > overflow > crash
        error_killed = self._container_was_killed(container_id)
        error_timeout = self._timeout_watcher.container_had_error(container_id)
        error_memory = self._memory_watcher.container_had_error(container_id)
        if error_killed:
            result = {"result": "killed"}
        elif error_timeout:
            result = {"result": "timeout"}
        elif error_memory:
            result = {"result": "overflow"}
        elif error_occured:
            result = {"result": "crash", "text": "An unknown error occurred while running the container"}
        else:
            # Get logs back
            try:
                stdout = str(docker_connection.logs(container_id, stdout=True, stderr=False))
                self.logger.debug(stdout)
                if debug == "ssh":  # skip the first line of the output, that contained the ssh key
                    stdout = "\n".join(stdout.split("\n")[1:])
                self.logger.debug(stdout)
                result = json.loads(stdout)
            except Exception as e:
                self.logger.warning("Cannot get back stdout of container %s! (%s)", container_id, str(e))
                result = {'result': 'crash', 'text': 'The grader did not return a readable output'}

        # Close RPyC server
        student_container_management.close()

        # Remove container
        thread.start_new_thread(docker_connection.remove_container, (container_id, True, False, True))

        # Remove subcontainers
        for sub_container_id in container_set:
            # Query the watchers about the subcontainer itself (not the parent, which was already handled above);
            # according to the original comment this also deletes it from the timeout/memory watchers
            self._timeout_watcher.container_had_error(sub_container_id)
            self._memory_watcher.container_had_error(sub_container_id)
            thread.start_new_thread(docker_connection.remove_container, (sub_container_id, True, False, True))

        # Delete folders
        rmtree(container_path)

        # Return!
        return result

    def _create_new_student_container(self, container_name, working_dir, command, memory_limit, time_limit, hard_time_limit, share_network,
                                      parent_container_id, container_set, student_path):
        """ Create and start a student container, and register it with the timeout/memory watchers.

        :param container_name: alias of the image to run; must be a key of self.image_aliases
        :param working_dir: working directory given to the container
        :param command: command given to the container
        :param memory_limit: memory limit; 0/None means 100, and values under 20 are raised to 20
        :param time_limit: time limit handed to the timeout watcher
        :param hard_time_limit: hard time limit, capped to 4 * time_limit
        :param share_network: if True, the container uses the network of parent_container_id; else networking is disabled
        :param parent_container_id: id of the grading container
        :param container_set: set receiving the id of the new container
        :param student_path: host directory mounted on /task/student
        :return: (container id, attached stdout/stderr socket, None) on success, (None, None, error message) otherwise
        """
        self.logger.debug("Starting new student container... %s %s %s %s %s %s", container_name, working_dir, command, memory_limit, time_limit,
                          hard_time_limit)
        try:
            client = docker.Client(**kwargs_from_env())
        except:
            self.logger.warning("Cannot connect to Docker!")
            return None, None, "Cannot connect to Docker!"

        # Default of 100, never less than 20
        mem_limit = max(memory_limit or 100, 20)

        if container_name not in self.image_aliases:
            self.logger.info("Unknown environment %s (not in aliases)", container_name)
            return None, None, "Unknown environment {} (not in aliases)".format(container_name)
        image = self.image_aliases[container_name]

        try:
            created = client.create_container(
                image,
                stdin_open=True,
                network_disabled=(not share_network),
                volumes={'/task/student': {}},
                command=command,
                working_dir=working_dir,
                user="******"
            )
            container_id = created["Id"]

            mem_limit_bytes = mem_limit * 1024 * 1024
            if share_network:
                network_mode = 'container:' + parent_container_id
            else:
                network_mode = None

            # Same value for memory and memory+swap: no swap
            client.start(container_id,
                         binds={os.path.abspath(student_path): {'ro': False, 'bind': '/task/student'}},
                         mem_limit=mem_limit_bytes,
                         memswap_limit=mem_limit_bytes,
                         oom_kill_disable=True,
                         network_mode=network_mode)

            attach_params = {'stdin': 0, 'stdout': 1, 'stderr': 1, 'stream': 1, 'logs': 1}
            stdout_err = client.attach_socket(container_id, attach_params)
        except Exception as err:
            self.logger.warning("Cannot start container! %s", err)
            return None, None, "Cannot start container! {}".format(err)

        container_set.add(container_id)

        # Hand the limits over to the watchers
        effective_hard_limit = min(time_limit * 4, hard_time_limit)
        self._timeout_watcher.add_container_timeout(container_id, time_limit, effective_hard_limit)
        self._memory_watcher.add_container_memory_limit(container_id, mem_limit)

        self.logger.info("New student container started")
        return container_id, stdout_err, None

    def _student_container_signal(self, container_id, signalnum):
        """ Send a signal to a student container.

        :param container_id: id of the student container to signal
        :param signalnum: signal forwarded as-is to the Docker client's kill call
        :return: True if Docker accepted the request, False if Docker could not be reached or rejected the request
        """
        self.logger.info("Sending signal %s to student container", str(signalnum))
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return False

        # Docker may reject the request; report the failure to the caller instead of letting the exception escape
        try:
            docker_connection.kill(container_id, signalnum)
        except Exception as e:
            self.logger.info("Cannot send signal %s to container %s: %s", str(signalnum), container_id, str(e))
            return False
        return True

    def _student_container_get_stdin(self, container_id):
        """ Attach to the stdin stream of a student container.

        :param container_id: id of the student container
        :return: whatever the Docker client's attach_socket call returns, or None if Docker cannot be reached
        """
        self.logger.info("Getting stdin of student container")
        try:
            client = docker.Client(**kwargs_from_env())
        except:
            self.logger.warning("Cannot connect to Docker!")
            return None

        # Only stdin is requested, in streaming mode
        attach_params = {'stdin': 1, 'stderr': 0, 'stdout': 0, 'stream': 1}
        stdin_socket = client.attach_socket(container_id, attach_params)
        self.logger.info("Returning stdin of student container")
        return stdin_socket

    def _student_container_close(self, container_id, container_set):
        """
        Wait for a student container to end, compute its return value and schedule its removal.

        :param container_id: id of the student container to close
        :param container_set: set of student container ids; ``container_id`` is dropped from it once the wait is over
        :return: the value given by ``_wait_for_container_completion``, replaced by

            - 254 if the Docker client cannot be created (in that case nothing else is done and the id stays in
              ``container_set``) or if the wait reported -1,
            - 253 if the timeout watcher reports an error for this container,
            - 252 if the memory watcher reports an error for this container (checked last, so it wins over 253).
        """
        self.logger.info("Closing student container")

        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            # Deliberately not a bare "except:", so KeyboardInterrupt/SystemExit are not swallowed
            self.logger.warning("Cannot connect to Docker!")
            return 254

        # Wait for completion (no timeout is given here)
        return_value = self._wait_for_container_completion(docker_connection, container_id, None)
        if return_value == -1:
            return_value = 254

        # Verify that everything went well
        if self._timeout_watcher.container_had_error(container_id):
            return_value = 253
        if self._memory_watcher.container_had_error(container_id):
            return_value = 252

        # Remove the container in a separate thread, so that the caller does not wait for the removal.
        # NOTE(review): the positional flags are presumably (v, link, force) -- confirm against the docker-py version in use
        thread.start_new_thread(docker_connection.remove_container, (container_id, True, False, True))

        # discard() instead of remove(): if the id was already dropped (e.g. by a concurrent close), we must not
        # raise a KeyError after the container has been waited for, as that would lose the return value.
        container_set.discard(container_id)

        # Return!
        return return_value

    def _get_agent_student_container_service(self, parent_container_id, container_set, student_path, default_container, default_time, default_memory):
        """
        Build the rpyc service class used to manage student containers.

        :param parent_container_id: forwarded to ``_create_new_student_container`` on each ``run``
        :param container_set: set of student container ids; ``signal``, ``stdin`` and ``close`` are only honoured for ids
            found in it
        :param student_path: forwarded to ``_create_new_student_container`` on each ``run``
        :param default_container: container name used when ``run`` receives an empty name
        :param default_time: time limit used when ``run`` receives 0
        :param default_memory: memory limit used when ``run`` receives 0
        :return: the service class itself (not an instance)
        """
        # Inside the service methods, "self" is the service instance: keep handles on the agent's bound methods here.
        spawn_container = self._create_new_student_container
        send_signal = self._student_container_signal
        fetch_stdin = self._student_container_get_stdin
        close_container = self._student_container_close

        class StudentContainerManagementService(rpyc.Service):

            def exposed_run(self, container_name, working_dir, command, memory_limit=0, time_limit=0, hard_time_limit=0, share_network=False):
                # Empty name / zero limits mean "use the defaults given when the service was built"
                container_name = default_container if container_name == "" else container_name
                memory_limit = default_memory if memory_limit == 0 else memory_limit
                time_limit = default_time if time_limit == 0 else time_limit
                # The default hard limit is derived from the (possibly defaulted) time limit
                hard_time_limit = 3 * time_limit if hard_time_limit == 0 else hard_time_limit

                return spawn_container(
                    str(container_name),
                    str(working_dir),
                    str(command),
                    int(memory_limit),
                    int(time_limit),
                    int(hard_time_limit),
                    bool(share_network),
                    parent_container_id,
                    container_set,
                    student_path)

            def exposed_signal(self, container_id, signalnum):
                # Unknown ids are silently ignored
                if container_id not in container_set:
                    return None
                return send_signal(str(container_id), int(signalnum))

            def exposed_stdin(self, container_id):
                # Unknown ids are silently ignored
                if container_id not in container_set:
                    return None
                return fetch_stdin(str(container_id))

            def exposed_close(self, container_id):
                # Unknown ids are silently ignored
                if container_id not in container_set:
                    return None
                return close_container(str(container_id), container_set)

        return StudentContainerManagementService

    def _wait_for_container_completion(self, docker_connection, container_id, max_time):
        """
        Wait for container completion.

        :param docker_connection: Docker client on which ``wait`` is called
        :param container_id: id of the container to wait for
        :param max_time: timeout, forwarded unchanged to ``wait``
        :return: the value returned by ``wait``, or -1 if ``wait`` raised an error
        """
        try:
            return_value = docker_connection.wait(container_id, max_time)
        except Exception:
            # Any failure of the wait is reported as -1. Not a bare "except:", so that
            # KeyboardInterrupt/SystemExit are not turned into a fake return value.
            return_value = -1
        self.logger.debug("Container id %s ended with %i", container_id, return_value)
        return return_value

    def _handle_container_ssh_start(self, docker_connection, container_id, job_id, ssh_callback):
        """
        Handle the creation of the distant SSH server.

        The stdout of the container is read (at most 30 times, sleeping one second after each failure) until its
        first line is a JSON document containing an "ssh_key" entry. The IP of the container is then registered,
        with port 22, in the remote SSH manager, and the key is given to ``ssh_callback``.

        :param docker_connection: Docker client used to read the logs and to inspect the container
        :param container_id: id of the container running the SSH server
        :param job_id: id of the job, used as key in the remote SSH manager and in the log messages
        :param ssh_callback: callable receiving ``(job_id, ssh_key)``, or None. Errors it raises are logged and ignored.
        :raises Exception: if remote debugging is not activated
        :return: None. Returns early, after logging, if the key or the IP cannot be obtained.
        """
        if self.remote_ssh_manager is None:
            raise Exception("Remote debugging is not activated")

        ssh_key = None
        for _ in range(30):
            try:
                # Only the first line of stdout is expected to hold the JSON document
                first_line = str(docker_connection.logs(container_id, stdout=True, stderr=False)).split("\n")[0]
                ssh_key = json.loads(first_line)["ssh_key"]
                self.logger.debug('Got SSH key for job %s', job_id)
                break
            except Exception as e:
                # The container may simply not have printed the key yet
                self.logger.debug("Cannot get SSH for job %s: %s. Retrying...", job_id, str(e))
            time.sleep(1)

        if ssh_key is None:
            # Make the failure visible instead of returning silently
            self.logger.warning("Cannot get SSH key for job %s, giving up", job_id)
            return

        try:
            ip = docker_connection.inspect_container(container_id)["NetworkSettings"]["IPAddress"]
        except Exception:
            self.logger.debug("Cannot inspect container to find IP, for job %s", job_id)
            return

        self.logger.debug("Got IP for SSH connection of job id %s: %s", job_id, ip)
        self.remote_ssh_manager.add_open_connection(job_id, ip, 22)
        self.logger.debug("Start key for the inginious-remote-debug command:\n%s\n%s", job_id, ssh_key)

        if ssh_callback is not None:
            try:
                ssh_callback(job_id, ssh_key)
            except Exception:
                # A failing callback must not break the job; not a bare "except:" so that
                # KeyboardInterrupt/SystemExit still propagate.
                self.logger.warning("Cannot call ssh_callback for job id %s", job_id)

    def _handle_container_ssh_close(self, job_id):
        """
        Mark the distant SSH server of a job as closed.

        :param job_id: id of the job whose connection is removed from the remote SSH manager
        :raises Exception: if remote debugging is not activated
        """
        manager = self.remote_ssh_manager
        if manager is None:
            raise Exception("Remote debugging is not activated")
        manager.del_connection(job_id)

    def _container_was_killed(self, container_id):
        """
        Tell whether a container was killed by the remote job manager.

        The id is forgotten as soon as the answer is True, so this can only be called ONCE for each container_id.

        :param container_id: id of the container to check
        :return: True if the id was recorded as killed, False otherwise
        """
        was_killed = container_id in self._killed_containers
        if was_killed:
            self._killed_containers.remove(container_id)
        return was_killed

    def kill_job(self, job_id):
        """
        Kill a running job.

        :param job_id: id of the job; its container is looked up in the job-to-container mapping
        :return: True if the ``kill`` call on the container succeeded. False if the job id is unknown, if the Docker
            client cannot be created or if ``kill`` raised an error.
        """
        container_id = self._container_for_job.get(job_id)
        if container_id is None:
            self.logger.error("Invalid job_id submitted to kill_job")
            return False

        # Add it now, even if the container is not really killed.
        self._killed_containers.add(container_id)

        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            # Deliberately not a bare "except:", so KeyboardInterrupt/SystemExit are not swallowed
            self.logger.error("kill_job cannot connect to Docker!")
            return False

        try:
            docker_connection.kill(container_id)
            return True
        except Exception as e:
            self.logger.info("Cannot kill container %s: %s", container_id, str(e))
            return False