Example #1
import json
import logging
import os
import re
import tarfile
import tempfile
import thread
import threading
from shutil import copytree, rmtree

import docker
import rpyc
from docker.utils import kwargs_from_env

# Project-local modules (import paths assumed from the surrounding codebase)
import common.base
from backend_agent._rpyc_unix_server import UnixSocketServer
from common.courses import Course


class SimpleAgent(object):
    """
        A simple agent that can only handle one request at a time. It should not be used directly.
        The field self.image_aliases (a dict mapping environment names to Docker image names)
        must be filled by subclasses.
    """
    logger = logging.getLogger("agent")

    def __init__(self, tmp_dir="./agent_tmp"):
        from backend_agent._cgroup_helper import CGroupTimeoutWatcher, CGroupMemoryWatcher

        self.logger.info("Starting agent")
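        # image_aliases maps environment names (as used in tasks) to Docker image names;
        # subclasses fill it, e.g. {"default": "ingi/inginious-c-default"} (image name illustrative).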
        self.image_aliases = {}
        self.tmp_dir = tmp_dir

        try:
            os.mkdir(tmp_dir)
        except OSError:
            pass

        self.logger.debug("Start cgroup helper")
        self._timeout_watcher = CGroupTimeoutWatcher()
        self._memory_watcher = CGroupMemoryWatcher()
        self._timeout_watcher.start()
        self._memory_watcher.start()

        # Init the internal job count, used to name the directories
        self._internal_job_count_lock = threading.Lock()
        self._internal_job_count = 0

    def _get_new_internal_job_id(self):
        """ Get a new internal job id """
        with self._internal_job_count_lock:
            internal_job_id = self._internal_job_count
            self._internal_job_count += 1
        return internal_job_id

    def handle_get_batch_container_metadata(self, container_name, docker_connection=None):
        """
            Returns the arguments needed by a particular batch container and its description
            :returns: a tuple, in the form
                ("container title",
                 "container description in restructuredtext",
                 {"key":
                    {
                     "type:" "file", #or "text",
                     "path": "path/to/file/inside/input/dir", #not mandatory in file, by default "key"
                     "name": "name of the field", #not mandatory in file, default "key"
                     "description": "a short description of what this field is used for" #not mandatory, default ""
                    }
                 }
                )
        """

        try:
            docker_connection = docker_connection or docker.Client(**kwargs_from_env())
            data = docker_connection.inspect_image(container_name)["ContainerConfig"]["Labels"]
        except Exception:
            self.logger.warning("Cannot inspect container %s", container_name)
            return None, None, None

        if "org.inginious.batch" not in data:
            self.logger.warning("Container %s is not a batch container", container_name)
            return None, None, None

        title = data.get("org.inginious.batch.title", container_name)
        description = data.get("org.inginious.batch.description", "")

        # Find valid keys
        args = {}
        for label in data:
            match = re.match(r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)$", label)
            if match and data[label] in ["file", "text"]:
                args[match.group(1)] = {"type": data[label]}

        # Parse additional metadata for the keys
        for label in data:
            match = re.match(r"^org\.inginious\.batch\.args\.([a-zA-Z0-9\-_]+)\.(name|description|path)$", label)
            if match and match.group(1) in args:
                if match.group(2) in ["name","description"]:
                    args[match.group(1)][match.group(2)] = data[label]
                elif match.group(2) == "path" and re.match(r"^[a-zA-Z\-_\./]+$",data[label]) and ".." not in data[label]:
                    args[match.group(1)]["path"] = data[label]

        # Fill in defaults for any missing metadata
        for key in args:
            if "name" not in args[key]:
                args[key]["name"] = key
            if "path" not in args[key]:
                args[key]["path"] = key
            if "description" not in args[key]:
                args[key]["description"] = ""

        return (title, description, args)
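
    # Illustrative (hypothetical) Dockerfile labels that the method above would parse:
    #   LABEL org.inginious.batch=true
    #   LABEL org.inginious.batch.title="CSV grader"
    #   LABEL org.inginious.batch.args.submissions="file"
    #   LABEL org.inginious.batch.args.submissions.name="Submissions archive"
    #   LABEL org.inginious.batch.args.submissions.path="submissions.tgz"
    # would yield:
    #   ("CSV grader", "", {"submissions": {"type": "file", "name": "Submissions archive",
    #                                       "path": "submissions.tgz", "description": ""}})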

    def handle_batch_job(self, job_id, container_name, input_data):
        """ Creates, executes and returns the results of a batch job.
            The return value of a batch job is always a compressed(gz) tar file.
        :param job_id: The distant job id
        :param container_name: The container image to launch
        :param input_data: a dict containing all the keys of get_batch_container_metadata(container_name)[2].
            The values associated are file-like objects for "file" types and  strings for "text" types.
        :return: a dict, containing either:
            - {"retval":0, "stdout": "...", "stderr":"...", "file":"..."}
                if everything went well. (where file is a tgz file containing the content of the /output folder from the container)
            - {"retval":"...", "stdout": "...", "stderr":"..."}
                if the container crashed (retval is an int != 0)
            - {"retval":-1, "stderr": "the error message"}
                if the container failed to start
        """
        self.logger.info("Received request for jobid %s (batch job)", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return {'retval': -1, "stderr": "Failed to connect to Docker"}

        batch_args = self.handle_get_batch_container_metadata(container_name, docker_connection)[2]
        if batch_args is None:
            return {'retval': -1, "stderr": "Inspecting the batch container image failed"}

        container_path = os.path.join(self.tmp_dir, str(internal_job_id))  # tmp_dir/id/
        input_path = os.path.join(container_path, 'input')  # tmp_dir/id/input/
        output_path = os.path.join(container_path, 'output')  # tmp_dir/id/output/
        try:
            rmtree(container_path)
        except OSError:
            pass

        os.mkdir(container_path)
        os.mkdir(input_path)
        os.mkdir(output_path)
        os.chmod(container_path, 0777)
        os.chmod(input_path, 0777)
        os.chmod(output_path, 0777)

        try:
            if set(input_data.keys()) != set(batch_args.keys()):
                raise Exception("Invalid keys for inputdata")

            for key in batch_args:
                if batch_args[key]["type"] == "text":
                    if not isinstance(input_data[key], basestring):
                        raise Exception("Invalid value for inputdata: the value for key {} should be a string".format(key))
                    with open(os.path.join(input_path, batch_args[key]["path"]), 'w') as input_file:
                        input_file.write(input_data[key])
                elif batch_args[key]["type"] == "file":
                    if isinstance(input_data[key], basestring):
                        raise Exception("Invalid value for inputdata: the value for key {} should be a file object".format(key))
                    with open(os.path.join(input_path, batch_args[key]["path"]), 'w') as input_file:
                        input_file.write(input_data[key].read())
        except Exception as e:
            rmtree(container_path)
            return {'retval': -1, "stderr": str(e)}

        # Run the container
        try:
            response = docker_connection.create_container(
                container_name,
                volumes={'/input': {}, '/output': {}}
            )
            container_id = response["Id"]

            # Start the container
            docker_connection.start(container_id,
                                    binds={os.path.abspath(input_path): {'ro': False, 'bind': '/input'},
                                           os.path.abspath(output_path): {'ro': False, 'bind': '/output'}})
        except Exception as e:
            self.logger.warning("Cannot start container! %s", str(e))
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Cannot start container'}

        # Wait for completion
        return_value = -1
        try:
            return_value = docker_connection.wait(container_id)
        except Exception:
            self.logger.info("Container for job id %s crashed", job_id)

        # If docker cannot do anything...
        if return_value == -1:
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Container crashed at startup'}

        # Get logs back
        stdout = ""
        stderr = ""
        try:
            stdout = str(docker_connection.logs(container_id, stdout=True, stderr=False))
            stderr = str(docker_connection.logs(container_id, stdout=False, stderr=True))
        except Exception:
            self.logger.warning("Cannot get back stdout/stderr of container %s!", container_id)
            rmtree(container_path)
            return {'retval': -1, "stderr": 'Cannot retrieve stdout/stderr from container'}

        # If something went wrong, we can return now
        if return_value != 0:
            rmtree(container_path)
            return {'retval': return_value, "stdout": stdout, "stderr": stderr}

        # Else, we can tgz the files in /output
        try:
            tmpfile = tempfile.TemporaryFile()
            tar = tarfile.open(fileobj=tmpfile, mode='w:gz')
            tar.add(output_path, arcname='/', recursive=True)
            tar.close()
            tmpfile.flush()
            tmpfile.seek(0)
        except Exception:
            rmtree(container_path)
            return {'retval': -1, "stderr": 'The agent was unable to archive the /output directory'}

        rmtree(container_path)
        return {'retval': return_value, "stdout": stdout, "stderr": stderr, "file": tmpfile}
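
    # Sketch of a hypothetical caller consuming the result of handle_batch_job:
    #   result = agent.handle_batch_job(42, "my/batch-image", input_data)
    #   if result["retval"] == 0:
    #       tarfile.open(fileobj=result["file"], mode='r:gz').extractall("./batch_output")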

    def handle_job(self, job_id, course_id, task_id, inputdata, debug, _callback_status):
        """ Creates, executes and returns the results of a new job
        :param job_id: The distant job id
        :param course_id: The course id of the linked task
        :param task_id: The task id of the linked task
        :param inputdata: Input data, given by the student (dict)
        :param debug: A boolean, indicating if the job should be run in debug mode or not
        :param _callback_status: Not used, should be None.
        """
        self.logger.info("Received request for jobid %s", job_id)
        internal_job_id = self._get_new_internal_job_id()
        self.logger.debug("New Internal job id -> %i", internal_job_id)

        # Initialize connection to Docker
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return {'result': 'crash', 'text': 'Cannot connect to Docker'}

        # Get back the task data (for the limits)
        try:
            task = Course(course_id).get_task(task_id)
        except Exception:
            self.logger.warning("Task %s/%s unavailable on this agent", course_id, task_id)
            return {'result': 'crash', 'text': 'Task unavailable on agent. Please retry later, the agents should synchronize soon. If the error '
                                               'persists, please contact your course administrator.'}

        limits = task.get_limits()

        mem_limit = limits.get("memory", 100)  # in MiB
        if mem_limit < 20:
            mem_limit = 20

        environment = task.get_environment()
        if environment not in self.image_aliases:
            self.logger.warning("Task %s/%s ask for an unknown environment %s (not in aliases)", course_id, task_id, environment)
            return {'result': 'crash', 'text': 'Unknown container. Please contact your course administrator.'}
        environment = self.image_aliases[environment]

        # Remove a possibly existing older folder and create the new ones
        container_path = os.path.join(self.tmp_dir, str(internal_job_id))  # tmp_dir/id/
        task_path = os.path.join(container_path, 'task')  # tmp_dir/id/task/
        sockets_path = os.path.join(container_path, 'sockets')  # tmp_dir/id/sockets/
        student_path = os.path.join(task_path, 'student')  # tmp_dir/id/task/student/
        try:
            rmtree(container_path)
        except OSError:
            pass

        os.mkdir(container_path)
        os.mkdir(sockets_path)
        os.chmod(container_path, 0777)
        os.chmod(sockets_path, 0777)

        copytree(os.path.join(common.base.get_tasks_directory(), task.get_course_id(), task.get_id()), task_path)
        os.chmod(task_path, 0777)

        if not os.path.exists(student_path):
            os.mkdir(student_path)
            os.chmod(student_path, 0777)

        # Run the container
        try:
            response = docker_connection.create_container(
                environment,
                stdin_open=True,
                volumes={'/task': {}, '/sockets': {}},
                network_disabled=True
            )
            container_id = response["Id"]

            # Start the RPyC server associated with this container
            container_set = set()
            student_container_management_service = self._get_agent_student_container_service(
                container_set,
                student_path,
                task.get_environment(),
                limits.get("time", 30),
                mem_limit)

            # Small workaround for error "AF_UNIX path too long" when the agent is launched inside a container. Resolve all symlinks to reduce the
            # path length.
            smaller_path_to_socket = os.path.realpath(os.path.join(sockets_path, 'INGInious.sock'))

            student_container_management = UnixSocketServer(
                student_container_management_service,
                socket_path=smaller_path_to_socket,
                protocol_config={"allow_public_attrs": True, 'allow_pickle': True})

            student_container_management_thread = threading.Thread(target=student_container_management.start)
            student_container_management_thread.daemon = True
            student_container_management_thread.start()

            # Start the container
            docker_connection.start(container_id,
                                    binds={os.path.abspath(task_path): {'ro': False, 'bind': '/task'},
                                           os.path.abspath(sockets_path): {'ro': False, 'bind': '/sockets'}},
                                    mem_limit=mem_limit * 1024 * 1024,
                                    memswap_limit=mem_limit * 1024 * 1024,  # disable swap
                                    oom_kill_disable=True)

            # Send the input data
            container_input = {"input": inputdata, "limits": limits}
            if debug:
                container_input["debug"] = True
            docker_connection.attach_socket(container_id, {'stdin': 1, 'stream': 1}).send(json.dumps(container_input) + "\n")
        except Exception as e:
            self.logger.warning("Cannot start container! %s", str(e))
            rmtree(container_path)
            return {'result': 'crash', 'text': 'Cannot start container'}

        # Ask the "cgroup" thread to verify the timeout/memory limit
        self._timeout_watcher.add_container_timeout(container_id, limits.get("time", 30), limits.get('hard_time', limits.get("time", 30) * 3))
        self._memory_watcher.add_container_memory_limit(container_id, mem_limit)

        # Wait for completion
        error_occurred = False
        try:
            return_value = docker_connection.wait(container_id, limits.get('hard_time', limits.get("time", 30) * 4))
            if return_value == -1:
                raise Exception('Container crashed!')
        except Exception:
            self.logger.info("Container for job id %s crashed", job_id)
            error_occurred = True

        # Verify that everything went well
        error_timeout = self._timeout_watcher.container_had_error(container_id)
        error_memory = self._memory_watcher.container_had_error(container_id)
        if error_timeout:
            result = {"result": "timeout"}
        elif error_memory:
            result = {"result": "overflow"}
        elif error_occurred:
            result = {"result": "crash", "text": "An unknown error occurred while running the container"}
        else:
            # Get logs back
            try:
                stdout = str(docker_connection.logs(container_id, stdout=True, stderr=False))
                result = json.loads(stdout)
            except Exception as e:
                self.logger.warning("Cannot get back stdout of container %s! (%s)", container_id, str(e))
                result = {'result': 'crash', 'text': 'The grader did not return a readable output'}

        # Close RPyC server
        student_container_management.close()

        # Remove container
        thread.start_new_thread(docker_connection.remove_container, (container_id, True, False, True))

        # Remove subcontainers
        for subcontainer_id in container_set:
            # Also clear them from the timeout/memory watchers
            self._timeout_watcher.container_had_error(subcontainer_id)
            self._memory_watcher.container_had_error(subcontainer_id)
            thread.start_new_thread(docker_connection.remove_container, (subcontainer_id, True, False, True))

        # Delete folders
        rmtree(container_path)

        # Return!
        return result
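
    # The dict returned above is either {"result": "timeout"}, {"result": "overflow"},
    # {"result": "crash", "text": "..."}, or the JSON object the grader printed on its stdout.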

    def _create_new_student_container(self, container_name, working_dir, command, memory_limit, time_limit, hard_time_limit, container_set,
                                      student_path):
        self.logger.debug("Starting new student container... %s %s %s %s %s %s", container_name, working_dir, command, memory_limit, time_limit,
                          hard_time_limit)
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return None, None, "Cannot connect to Docker!"

        mem_limit = memory_limit or 100
        if mem_limit < 20:
            mem_limit = 20

        if container_name not in self.image_aliases:
            self.logger.info("Unknown environment %s (not in aliases)", container_name)
            return None, None, "Unknown environment {} (not in aliases)".format(container_name)
        environment = self.image_aliases[container_name]

        try:
            response = docker_connection.create_container(
                environment,
                stdin_open=True,
                network_disabled=True,
                volumes={'/task/student': {}},
                command=command,
                working_dir=working_dir
            )
            container_id = response["Id"]

            # Start the container
            docker_connection.start(container_id,
                                    binds={os.path.abspath(student_path): {'ro': False, 'bind': '/task/student'}},
                                    mem_limit=mem_limit * 1024 * 1024,  # memory limit in bytes (mem_limit is in MiB)
                                    memswap_limit=mem_limit * 1024 * 1024,  # disable swap
                                    oom_kill_disable=True
                                    )

            stdout_err = docker_connection.attach_socket(container_id, {'stdin': 0, 'stdout': 1, 'stderr': 1, 'stream': 1, 'logs': 1})
        except Exception as e:
            self.logger.warning("Cannot start container! %s", e)
            return None, None, "Cannot start container! {}".format(e)

        container_set.add(container_id)
        # Ask the "cgroup" thread to verify the timeout/memory limit
        self._timeout_watcher.add_container_timeout(container_id, time_limit, min(time_limit * 4, hard_time_limit))
        self._memory_watcher.add_container_memory_limit(container_id, mem_limit)

        self.logger.info("New student container started")
        return container_id, stdout_err, None
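
    # The tuple returned above is (container_id, attached stdout/stderr socket, error message):
    # on success the error message is None; on failure container_id and the socket are None.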

    def _student_container_signal(self, container_id, signalnum):
        self.logger.info("Sending signal %s to student container", str(signalnum))
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return False

        docker_connection.kill(container_id, signalnum)
        return True

    def _student_container_get_stdin(self, container_id):
        self.logger.info("Getting stdin of student container")
        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return None

        stdin = docker_connection.attach_socket(container_id, {'stdin': 1, 'stderr': 0, 'stdout': 0, 'stream': 1})
        self.logger.info("Returning stdin of student container")
        return stdin

    def _student_container_close(self, container_id, container_set):
        self.logger.info("Closing student container")

        try:
            docker_connection = docker.Client(**kwargs_from_env())
        except Exception:
            self.logger.warning("Cannot connect to Docker!")
            return 254

        # Wait for completion; 254 means the container crashed or Docker failed
        return_value = 254
        try:
            return_value = docker_connection.wait(container_id)
            if return_value == -1:
                return_value = 254
        except Exception:
            pass

        # Verify that everything went well
        if self._timeout_watcher.container_had_error(container_id):
            return_value = 253
        if self._memory_watcher.container_had_error(container_id):
            return_value = 252

        # Remove container
        thread.start_new_thread(docker_connection.remove_container, (container_id, True, False, True))
        container_set.remove(container_id)

        # Return!
        return return_value
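
    # Exit-code convention implemented above for student containers:
    #   252 = memory limit exceeded, 253 = timeout, 254 = crash or Docker error,
    #   anything else = the container's own exit status.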

    def _get_agent_student_container_service(self, container_set, student_path, default_container, default_time, default_memory):
        create_new_student_container = self._create_new_student_container
        student_container_signal = self._student_container_signal
        student_container_get_stdin = self._student_container_get_stdin
        student_container_close = self._student_container_close

        class StudentContainerManagementService(rpyc.Service):

            def exposed_run(self, container_name, working_dir, command, memory_limit, time_limit, hard_time_limit):
                if container_name == "":
                    container_name = default_container
                if memory_limit == 0:
                    memory_limit = default_memory
                if time_limit == 0:
                    time_limit = default_time
                if hard_time_limit == 0:
                    hard_time_limit = 3 * time_limit
                return create_new_student_container(str(container_name), str(working_dir), str(command), int(memory_limit), int(time_limit),
                                                    int(hard_time_limit), container_set, student_path)

            def exposed_signal(self, container_id, signalnum):
                if container_id in container_set:
                    return student_container_signal(str(container_id), int(signalnum))
                return None

            def exposed_stdin(self, container_id):
                if container_id in container_set:
                    return student_container_get_stdin(str(container_id))
                return None

            def exposed_close(self, container_id):
                if container_id in container_set:
                    return student_container_close(str(container_id), container_set)
                return None

        return StudentContainerManagementService
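
A minimal usage sketch (hypothetical: it assumes a reachable Docker daemon, the INGInious task layout on disk, and that the image names below exist locally):

class MyAgent(SimpleAgent):
    """ Example subclass that fills in the image aliases. """
    def __init__(self):
        SimpleAgent.__init__(self, tmp_dir="./agent_tmp")
        self.image_aliases = {"default": "ingi/inginious-c-default"}  # placeholder image name

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    agent = MyAgent()
    title, description, args = agent.handle_get_batch_container_metadata("my/batch-image")
    if title is not None:
        # Assuming the batch container declares only "text" arguments:
        input_data = dict((key, "some text") for key in args)
        result = agent.handle_batch_job(1, "my/batch-image", input_data)
        print result["retval"]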