コード例 #1
0
    def _start_execution_in_container(
        self,
        args,
        stdin,
        stdout,
        stderr,
        env,
        root_dir,
        cwd,
        temp_dir,
        memlimit,
        memory_nodes,
        cgroups,
        output_dir,
        result_files_patterns,
        parent_setup_fn,
        child_setup_fn,
        parent_cleanup_fn,
    ):
        """Execute the given command and measure its resource usage similarly to
        super()._start_execution(), but inside a container implemented using Linux
        namespaces.  The command has no network access (only loopback),
        a fresh directory as /tmp and no write access outside of this,
        and it does not see other processes except itself.
        """
        assert self._use_namespaces

        if root_dir is None:
            env.update(self._env_override)

        # We have three processes involved:
        # parent: the current Python process in which RunExecutor is executing
        # child: child process in new namespace (PID 1 in inner namespace),
        #        configures inner namespace, serves as dummy init,
        #        collects result of grandchild and passes it to parent
        # grandchild: child of child process (PID 2 in inner namespace), exec()s tool

        # We need the following communication steps between these proceses:
        # 1a) grandchild tells parent its PID (in outer namespace).
        # 1b) grandchild tells parent that it is ready and measurement should begin.
        # 2) parent tells grandchild that measurement has begun and tool should
        #    be exec()ed.
        # 3) child tells parent about return value and resource consumption of
        #    grandchild.
        # 1a and 1b are done together by sending the PID through a pipe.
        # 2 is done by sending a null byte through a pipe.
        # 3 is done by sending a pickled object through the same pipe as #2.
        # We cannot use the same pipe for both directions, because otherwise a sender
        # might read the bytes it has sent itself.

        # Error codes from child to parent
        CHILD_OSERROR = 128  # noqa: N806 local constant
        CHILD_UNKNOWN_ERROR = 129  # noqa: N806 local constant

        # "downstream" pipe parent->grandchild
        from_parent, to_grandchild = os.pipe()
        # "upstream" pipe grandchild/child->parent
        from_grandchild, to_parent = os.pipe()

        # The protocol for these pipes is that first the parent sends the marker for
        # user mappings, then the grand child sends its outer PID back,
        # and finally the parent sends its completion marker.
        # After the run, the child sends the result of the grand child and then waits
        # for the post_run marker, before it terminates.
        MARKER_USER_MAPPING_COMPLETED = b"A"  # noqa: N806 local constant
        MARKER_PARENT_COMPLETED = b"B"  # noqa: N806 local constant
        MARKER_PARENT_POST_RUN_COMPLETED = b"C"  # noqa: N806 local constant

        # If the current directory is within one of the bind mounts we create,
        # we need to cd into this directory again, otherwise we would not see the
        # bind mount, but the directory behind it.
        # Thus we always set cwd to force a change of directory.
        if root_dir is None:
            cwd = os.path.abspath(cwd or os.curdir)
        else:
            root_dir = os.path.abspath(root_dir)
            cwd = os.path.abspath(cwd)

        def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.  According to
                # http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python 2 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance
                # (that's what we do).
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc(self._container_system_config)
                container.drop_capabilities()
                container.reset_signal_handling()
                child_setup_fn()  # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID
                # and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == MARKER_PARENT_COMPLETED, received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
            # here Python will exec() the tool for us

        def child():
            """Setup everything inside the container,
            start the tool, and wait for result."""
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs(),
                )

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file
                # descriptors, to avoid keeping other pipes and files open, e.g.,
                # those that the parent uses to communicate with other containers
                # (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually. We keep the relevant ends of our pipes,
                # and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin,
                    sys.stdout,
                    sys.stderr,
                    to_parent,
                    from_parent,
                    stdin,
                    stdout,
                    stderr,
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if self._container_system_config:
                        # A standard hostname increases reproducibility.
                        socket.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Wait until user mapping is finished,
                    # this is necessary for filesystem writes
                    received = os.read(from_parent,
                                       len(MARKER_USER_MAPPING_COMPLETED))
                    assert received == MARKER_USER_MAPPING_COMPLETED, received

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes,
                        )

                    # Marking this process as "non-dumpable" (no core dumps) also
                    # forbids several other ways how other processes can access and
                    # influence it:
                    # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
                    # We set this to prevent the benchmarked tool from messing with this
                    # process or using it to escape from the container. More info:
                    # http://man7.org/linux/man-pages/man5/proc.5.html
                    # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
                    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0,
                               0, 0)
                except OSError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except OSError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                container.setup_seccomp_filter()

                try:
                    grandchild_proc = subprocess.Popen(
                        args,
                        stdin=stdin,
                        stdout=stdout,
                        stderr=stderr,
                        env=env,
                        close_fds=False,
                        preexec_fn=grandchild,
                    )
                except (OSError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                # keep capability for unmount if necessary later
                necessary_capabilities = ([libc.CAP_SYS_ADMIN]
                                          if result_files_patterns else [])
                container.drop_capabilities(keep=necessary_capabilities)

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={
                    sys.stdout, sys.stderr, to_parent, from_parent
                })

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                grandchild_result = container.wait_for_child_and_forward_signals(
                    grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.",
                    args[0],
                    grandchild_result[0],
                )

                if result_files_patterns:
                    # Remove the bind mount that _setup_container_filesystem added
                    # such that the parent can access the result files.
                    libc.umount(temp_dir.encode())

                # Re-allow access to /proc/<child>/...,
                # this is used by the parent for accessing output files
                libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                # Now the parent copies the output files, we need to wait until this is
                # finished. If the child terminates, the container file system and its
                # tmpfs go away.
                assert os.read(from_parent,
                               1) == MARKER_PARENT_POST_RUN_COMPLETED
                os.close(from_parent)

                return 0
            except OSError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except BaseException:
                # Need to catch everything because this method always needs to return an
                # int (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR

        try:  # parent
            try:
                child_pid = container.execute_in_namespace(
                    child, use_network_ns=not self._allow_network)
            except OSError as e:
                if (e.errno == errno.EPERM and util.try_read_file(
                        "/proc/sys/kernel/unprivileged_userns_clone") == "0"):
                    raise BenchExecException(
                        "Unprivileged user namespaces forbidden on this system, please "
                        "enable them with 'sysctl -w kernel.unprivileged_userns_clone=1' "
                        "or disable container mode")
                elif (e.errno in {errno.ENOSPC, errno.EINVAL} and
                      util.try_read_file("/proc/sys/user/max_user_namespaces")
                      == "0"):
                    # Ubuntu has ENOSPC, Centos seems to produce EINVAL in this case
                    raise BenchExecException(
                        "Unprivileged user namespaces forbidden on this system, please "
                        "enable by using 'sysctl -w user.max_user_namespaces=10000' "
                        "(or another value) or disable container mode")
                else:
                    raise BenchExecException(
                        "Creating namespace for container mode failed: " +
                        os.strerror(e.errno))
            logging.debug(
                "Parent: child process of RunExecutor with PID %d started.",
                child_pid)

            def check_child_exit_code():
                """Check if the child process terminated cleanly
                and raise an error otherwise."""
                child_exitcode, unused_child_rusage = self._wait_for_process(
                    child_pid, args[0])
                child_exitcode = util.ProcessExitCode.from_raw(child_exitcode)
                logging.debug(
                    "Parent: child process of RunExecutor with PID %d"
                    " terminated with %s.",
                    child_pid,
                    child_exitcode,
                )

                if child_exitcode:
                    if child_exitcode.value:
                        if child_exitcode.value == CHILD_OSERROR:
                            # This was an OSError in the child,
                            # details were already logged
                            raise BenchExecException(
                                "execution in container failed, check log for details"
                            )
                        elif child_exitcode.value == CHILD_UNKNOWN_ERROR:
                            raise BenchExecException(
                                "unexpected error in container")
                        raise OSError(child_exitcode.value,
                                      os.strerror(child_exitcode.value))
                    raise OSError(
                        0,
                        "Child process of RunExecutor terminated with " +
                        str(child_exitcode),
                    )

            # Close unnecessary ends of pipes such that read() does not block forever
            # if all other processes have terminated.
            os.close(from_parent)
            os.close(to_parent)

            container.setup_user_mapping(child_pid,
                                         uid=self._uid,
                                         gid=self._gid)
            # signal child to continue
            os.write(to_grandchild, MARKER_USER_MAPPING_COMPLETED)

            try:
                # read at most 10 bytes because this is enough for 32bit int
                grandchild_pid = int(os.read(from_grandchild, 10))
            except ValueError:
                # probably empty read, i.e., pipe closed,
                # i.e., child or grandchild failed
                check_child_exit_code()
                assert False, (
                    "Child process of RunExecutor terminated cleanly"
                    " but did not send expected data.")

            logging.debug(
                "Parent: executing %s in grand child with PID %d"
                " via child with PID %d.",
                args[0],
                grandchild_pid,
                child_pid,
            )

            # start measurements
            cgroups.add_task(grandchild_pid)
            parent_setup = parent_setup_fn()

            # Signal grandchild that setup is finished
            os.write(to_grandchild, MARKER_PARENT_COMPLETED)

            # Copy file descriptor, otherwise we could not close from_grandchild in
            # finally block and would leak a file descriptor in case of exception.
            from_grandchild_copy = os.dup(from_grandchild)
            to_grandchild_copy = os.dup(to_grandchild)
        finally:
            os.close(from_grandchild)
            os.close(to_grandchild)

        def wait_for_grandchild():
            # 1024 bytes ought to be enough for everyone^Wour pickled result
            try:
                received = os.read(from_grandchild_copy, 1024)
            except OSError as e:
                if self.PROCESS_KILLED and e.errno == errno.EINTR:
                    # Read was interrupted because of Ctrl+C, we just try again
                    received = os.read(from_grandchild_copy, 1024)
                else:
                    raise e

            if not received:
                # Typically this means the child exited prematurely because an error
                # occurred, and check_child_exitcode() will handle this.
                # We close the pipe first, otherwise child could hang infinitely.
                os.close(from_grandchild_copy)
                os.close(to_grandchild_copy)
                check_child_exit_code()
                assert False, "Child process terminated cleanly without sending result"

            exitcode, ru_child = pickle.loads(received)

            base_path = "/proc/{}/root".format(child_pid)
            parent_cleanup = parent_cleanup_fn(
                parent_setup, util.ProcessExitCode.from_raw(exitcode),
                base_path)

            if result_files_patterns:
                # As long as the child process exists
                # we can access the container file system here
                self._transfer_output_files(base_path + temp_dir, cwd,
                                            output_dir, result_files_patterns)

            os.close(from_grandchild_copy)
            os.write(to_grandchild_copy, MARKER_PARENT_POST_RUN_COMPLETED)
            os.close(to_grandchild_copy)  # signal child that it can terminate
            check_child_exit_code()

            return exitcode, ru_child, parent_cleanup

        return grandchild_pid, wait_for_grandchild
コード例 #2
0
def _init_container(
    temp_dir,
    network_access,
    dir_modes,
    container_system_config,
    container_tmpfs,  # ignored, tmpfs is always used
):
    """
    Create a fork of this process in a container. This method only returns in the fork,
    so calling it seems like moving the current process into a container.
    """
    # Prepare for private home directory, some tools write there
    if container_system_config:
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Preparations
    temp_dir = temp_dir.encode()
    dir_modes = collections.OrderedDict(
        sorted(
            ((path.encode(), kind) for (path, kind) in dir_modes.items()),
            key=lambda tupl: len(tupl[0]),
        )
    )
    uid = container.CONTAINER_UID if container_system_config else os.getuid()
    gid = container.CONTAINER_GID if container_system_config else os.getgid()

    # Create container.
    # Contrary to ContainerExecutor, which uses clone to start a new process in new
    # namespaces, we use unshare, which puts the current process (the multiprocessing
    # worker process) into new namespaces.
    # The exception is the PID namespace, which will only apply to children processes.
    flags = (
        libc.CLONE_NEWNS
        | libc.CLONE_NEWUTS
        | libc.CLONE_NEWIPC
        | libc.CLONE_NEWUSER
        | libc.CLONE_NEWPID
    )
    if not network_access:
        flags |= libc.CLONE_NEWNET
    try:
        libc.unshare(flags)
    except OSError as e:
        if (
            e.errno == errno.EPERM
            and util.try_read_file("/proc/sys/kernel/unprivileged_userns_clone") == "0"
        ):
            raise BenchExecException(
                "Unprivileged user namespaces forbidden on this system, please "
                "enable them with 'sysctl kernel.unprivileged_userns_clone=1' "
                "or disable container mode"
            )
        else:
            raise BenchExecException(
                "Creating namespace for container mode failed: " + os.strerror(e.errno)
            )

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # Because this process is not actually in the new PID namespace, we fork.
    # The child will be in the new PID namespace and will assume the role of the acting
    # multiprocessing worker (which it can do because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The original multiprocessing worker (the parent of the fork) must do nothing in
    # order to not confuse multiprocessing.
    pid = os.fork()
    if pid:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(pid, 0)
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
    container.setup_seccomp_filter()