def _start_execution_in_container(
    self,
    args,
    stdin,
    stdout,
    stderr,
    env,
    root_dir,
    cwd,
    temp_dir,
    memlimit,
    memory_nodes,
    cgroups,
    output_dir,
    result_files_patterns,
    parent_setup_fn,
    child_setup_fn,
    parent_cleanup_fn,
):
    """Execute the given command and measure its resource usage similarly to
    super()._start_execution(), but inside a container implemented using Linux
    namespaces. Unless self._allow_network is set, the command has no network
    access (only loopback). It gets a fresh directory as /tmp and no write access
    outside of this, and it does not see other processes except itself.

    @param args: command line of the tool, passed to subprocess.Popen
        (args[0] is also used in log messages)
    @param stdin: stdin for the tool (passed to subprocess.Popen)
    @param stdout: stdout for the tool (passed to subprocess.Popen)
    @param stderr: stderr for the tool (passed to subprocess.Popen)
    @param env: environment of the tool; if root_dir is None it is updated in place
        with self._env_override
    @param root_dir: if not None, passed to self._setup_root_filesystem();
        otherwise self._setup_container_filesystem() is used
    @param cwd: working directory for the tool (made absolute here)
    @param temp_dir: passed to self._setup_container_filesystem(); also the
        location below the container root from which output files are transferred
    @param memlimit: passed to self._setup_container_filesystem()
    @param memory_nodes: passed to self._setup_container_filesystem()
    @param cgroups: object whose add_task() is called with the PID of the grandchild
    @param output_dir: target directory for self._transfer_output_files()
    @param result_files_patterns: patterns for self._transfer_output_files();
        if empty, no output files are transferred
    @param parent_setup_fn: called without arguments in the parent
        right before the tool is started; its result is given to parent_cleanup_fn
    @param child_setup_fn: called without arguments in the grandchild
        before the tool is exec()ed
    @param parent_cleanup_fn: called in the parent after the tool terminated with
        the result of parent_setup_fn, the exit code of the tool,
        and the path to the root directory of the container
    @return: a tuple of the PID of the tool (in the outer namespace) and a function
        that waits for the tool and returns a triple of the raw exit code,
        the resource usage, and the result of parent_cleanup_fn
    @raise BenchExecException: if the container cannot be created
        or the child process reports an error
    @raise OSError: if the child process terminates with an unexpected exit code
    """
    assert self._use_namespaces

    if root_dir is None:
        env.update(self._env_override)

    # We have three processes involved:
    # parent: the current Python process in which RunExecutor is executing
    # child: child process in new namespace (PID 1 in inner namespace),
    #        configures inner namespace, serves as dummy init,
    #        collects result of grandchild and passes it to parent
    # grandchild: child of child process (PID 2 in inner namespace), exec()s tool

    # We need the following communication steps between these processes:
    # 1a) grandchild tells parent its PID (in outer namespace).
    # 1b) grandchild tells parent that it is ready and measurement should begin.
    # 2) parent tells grandchild that measurement has begun and tool should
    #    be exec()ed.
    # 3) child tells parent about return value and resource consumption of
    #    grandchild.
    # 1a and 1b are done together by sending the PID through a pipe.
    # 2 is done by sending a null byte through a pipe.
    # 3 is done by sending a pickled object through the same pipe as #2.
    # We cannot use the same pipe for both directions, because otherwise a sender
    # might read the bytes it has sent itself.

    # Error codes from child to parent
    CHILD_OSERROR = 128  # noqa: N806 local constant
    CHILD_UNKNOWN_ERROR = 129  # noqa: N806 local constant

    # "downstream" pipe parent->grandchild
    from_parent, to_grandchild = os.pipe()
    # "upstream" pipe grandchild/child->parent
    from_grandchild, to_parent = os.pipe()

    # The protocol for these pipes is that first the parent sends the marker for
    # user mappings, then the grand child sends its outer PID back,
    # and finally the parent sends its completion marker.
    # After the run, the child sends the result of the grand child and then waits
    # for the post_run marker, before it terminates.
    MARKER_USER_MAPPING_COMPLETED = b"A"  # noqa: N806 local constant
    MARKER_PARENT_COMPLETED = b"B"  # noqa: N806 local constant
    MARKER_PARENT_POST_RUN_COMPLETED = b"C"  # noqa: N806 local constant

    # If the current directory is within one of the bind mounts we create,
    # we need to cd into this directory again, otherwise we would not see the
    # bind mount, but the directory behind it.
    # Thus we always set cwd to force a change of directory.
    if root_dir is None:
        cwd = os.path.abspath(cwd or os.curdir)
    else:
        root_dir = os.path.abspath(root_dir)
        cwd = os.path.abspath(cwd)

    def grandchild():
        """Setup everything inside the process that finally exec()s the tool."""
        try:
            # We know that this process has PID 2 in the inner namespace,
            # but we actually need to know its PID in the outer namespace
            # such that parent can put us into the correct cgroups. According to
            # http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
            # there are two ways to achieve this: sending a message with the PID
            # via a socket (but Python 2 lacks a convenient API for sendmsg),
            # and reading /proc/self in the outer procfs instance
            # (that's what we do).
            my_outer_pid = container.get_my_pid_from_procfs()

            container.mount_proc(self._container_system_config)
            container.drop_capabilities()
            container.reset_signal_handling()
            child_setup_fn()  # Do some other setup the caller wants.

            # Signal readiness to parent by sending our PID
            # and wait until parent is also ready
            os.write(to_parent, str(my_outer_pid).encode())
            received = os.read(from_parent, 1)
            # Explicit check instead of "assert" such that the tool is never
            # started without the parent's go-ahead, even with "python -O".
            if received != MARKER_PARENT_COMPLETED:
                raise AssertionError(received)
        finally:
            # close remaining ends of pipe
            os.close(from_parent)
            os.close(to_parent)
        # here Python will exec() the tool for us

    def child():
        """Setup everything inside the container,
        start the tool, and wait for result.
        Returns the exit code for the child process (0 on success)."""
        try:
            logging.debug(
                "Child: child process of RunExecutor with PID %d started",
                container.get_my_pid_from_procfs(),
            )

            # Put all received signals on hold until we handle them later.
            container.block_all_signals()

            # We want to avoid leaking file descriptors to the executed child.
            # It is also nice if the child has only the minimal necessary file
            # descriptors, to avoid keeping other pipes and files open, e.g.,
            # those that the parent uses to communicate with other containers
            # (if containers are started in parallel).
            # Thus we do not use the close_fds feature of subprocess.Popen,
            # but do the same here manually. We keep the relevant ends of our pipes,
            # and stdin/out/err of child and grandchild.
            necessary_fds = {
                sys.stdin,
                sys.stdout,
                sys.stderr,
                to_parent,
                from_parent,
                stdin,
                stdout,
                stderr,
            } - {None}
            container.close_open_fds(keep_files=necessary_fds)

            try:
                if self._container_system_config:
                    # A standard hostname increases reproducibility.
                    socket.sethostname(container.CONTAINER_HOSTNAME)

                if not self._allow_network:
                    container.activate_network_interface("lo")

                # Wait until user mapping is finished,
                # this is necessary for filesystem writes
                received = os.read(from_parent, len(MARKER_USER_MAPPING_COMPLETED))
                if received != MARKER_USER_MAPPING_COMPLETED:
                    # Not an OSError, so this is reported as CHILD_UNKNOWN_ERROR.
                    raise AssertionError(received)

                if root_dir is not None:
                    self._setup_root_filesystem(root_dir)
                else:
                    self._setup_container_filesystem(
                        temp_dir,
                        output_dir if result_files_patterns else None,
                        memlimit,
                        memory_nodes,
                    )

                # Marking this process as "non-dumpable" (no core dumps) also
                # forbids several other ways how other processes can access and
                # influence it:
                # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
                # We set this to prevent the benchmarked tool from messing with this
                # process or using it to escape from the container. More info:
                # http://man7.org/linux/man-pages/man5/proc.5.html
                # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
                libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
            except OSError as e:
                logging.critical("Failed to configure container: %s", e)
                return CHILD_OSERROR

            try:
                os.chdir(cwd)
            except OSError as e:
                logging.critical(
                    "Cannot change into working directory inside container: %s", e
                )
                return CHILD_OSERROR

            container.setup_seccomp_filter()

            try:
                grandchild_proc = subprocess.Popen(
                    args,
                    stdin=stdin,
                    stdout=stdout,
                    stderr=stderr,
                    env=env,
                    close_fds=False,
                    preexec_fn=grandchild,
                )
            except (OSError, RuntimeError) as e:
                logging.critical("Cannot start process: %s", e)
                return CHILD_OSERROR

            # keep capability for unmount if necessary later
            necessary_capabilities = (
                [libc.CAP_SYS_ADMIN] if result_files_patterns else []
            )
            container.drop_capabilities(keep=necessary_capabilities)

            # Close other fds that were still necessary above.
            container.close_open_fds(
                keep_files={sys.stdout, sys.stderr, to_parent, from_parent}
            )

            # Set up signal handlers to forward signals to grandchild
            # (because we are PID 1, there is a special signal handling otherwise).
            # cf. dumb-init project: https://github.com/Yelp/dumb-init
            # Also wait for grandchild and return its result.
            grandchild_result = container.wait_for_child_and_forward_signals(
                grandchild_proc.pid, args[0]
            )

            logging.debug(
                "Child: process %s terminated with exit code %d.",
                args[0],
                grandchild_result[0],
            )

            if result_files_patterns:
                # Remove the bind mount that _setup_container_filesystem added
                # such that the parent can access the result files.
                libc.umount(temp_dir.encode())

            # Re-allow access to /proc/<child>/...,
            # this is used by the parent for accessing output files
            libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

            os.write(to_parent, pickle.dumps(grandchild_result))
            os.close(to_parent)

            # Now the parent copies the output files, we need to wait until this is
            # finished. If the child terminates, the container file system and its
            # tmpfs go away.
            # The read must not be part of an "assert" statement: with "python -O"
            # it would be removed and we would terminate without waiting.
            received = os.read(from_parent, 1)
            if received != MARKER_PARENT_POST_RUN_COMPLETED:
                raise AssertionError(received)
            os.close(from_parent)

            return 0
        except OSError:
            logging.exception("Error in child process of RunExecutor")
            return CHILD_OSERROR
        except BaseException:
            # Need to catch everything because this method always needs to return an
            # int (we are inside a C callback that requires returning int).
            logging.exception("Error in child process of RunExecutor")
            return CHILD_UNKNOWN_ERROR

    try:  # parent
        try:
            child_pid = container.execute_in_namespace(
                child, use_network_ns=not self._allow_network
            )
        except OSError as e:
            # On this error path the regular close of these pipe ends below is
            # never reached, so close them here to not leak them in the parent.
            # (The other two ends are closed in the finally block.)
            os.close(from_parent)
            os.close(to_parent)

            if (
                e.errno == errno.EPERM
                and util.try_read_file("/proc/sys/kernel/unprivileged_userns_clone")
                == "0"
            ):
                raise BenchExecException(
                    "Unprivileged user namespaces forbidden on this system, please "
                    "enable them with 'sysctl -w kernel.unprivileged_userns_clone=1' "
                    "or disable container mode"
                ) from e
            elif (
                e.errno in {errno.ENOSPC, errno.EINVAL}
                and util.try_read_file("/proc/sys/user/max_user_namespaces") == "0"
            ):
                # Ubuntu has ENOSPC, Centos seems to produce EINVAL in this case
                raise BenchExecException(
                    "Unprivileged user namespaces forbidden on this system, please "
                    "enable by using 'sysctl -w user.max_user_namespaces=10000' "
                    "(or another value) or disable container mode"
                ) from e
            else:
                raise BenchExecException(
                    "Creating namespace for container mode failed: "
                    + os.strerror(e.errno)
                ) from e
        logging.debug(
            "Parent: child process of RunExecutor with PID %d started.", child_pid
        )

        def check_child_exit_code():
            """Check if the child process terminated cleanly
            and raise an error otherwise."""
            child_exitcode, unused_child_rusage = self._wait_for_process(
                child_pid, args[0]
            )
            child_exitcode = util.ProcessExitCode.from_raw(child_exitcode)
            logging.debug(
                "Parent: child process of RunExecutor with PID %d"
                " terminated with %s.",
                child_pid,
                child_exitcode,
            )

            if child_exitcode:
                if child_exitcode.value:
                    if child_exitcode.value == CHILD_OSERROR:
                        # This was an OSError in the child,
                        # details were already logged
                        raise BenchExecException(
                            "execution in container failed, check log for details"
                        )
                    elif child_exitcode.value == CHILD_UNKNOWN_ERROR:
                        raise BenchExecException("unexpected error in container")
                    raise OSError(
                        child_exitcode.value, os.strerror(child_exitcode.value)
                    )
                raise OSError(
                    0,
                    "Child process of RunExecutor terminated with "
                    + str(child_exitcode),
                )

        # Close unnecessary ends of pipes such that read() does not block forever
        # if all other processes have terminated.
        os.close(from_parent)
        os.close(to_parent)

        container.setup_user_mapping(child_pid, uid=self._uid, gid=self._gid)
        # signal child to continue
        os.write(to_grandchild, MARKER_USER_MAPPING_COMPLETED)

        try:
            # read at most 10 bytes because this is enough for 32bit int
            grandchild_pid = int(os.read(from_grandchild, 10))
        except ValueError:
            # probably empty read, i.e., pipe closed,
            # i.e., child or grandchild failed
            check_child_exit_code()
            # Explicit raise instead of "assert False" such that we never continue
            # without grandchild_pid, even with "python -O".
            raise AssertionError(
                "Child process of RunExecutor terminated cleanly"
                " but did not send expected data."
            )

        logging.debug(
            "Parent: executing %s in grand child with PID %d"
            " via child with PID %d.",
            args[0],
            grandchild_pid,
            child_pid,
        )

        # start measurements
        cgroups.add_task(grandchild_pid)
        parent_setup = parent_setup_fn()

        # Signal grandchild that setup is finished
        os.write(to_grandchild, MARKER_PARENT_COMPLETED)

        # Copy file descriptor, otherwise we could not close from_grandchild in
        # finally block and would leak a file descriptor in case of exception.
        from_grandchild_copy = os.dup(from_grandchild)
        to_grandchild_copy = os.dup(to_grandchild)
    finally:
        os.close(from_grandchild)
        os.close(to_grandchild)

    def wait_for_grandchild():
        """Wait for the result of the tool, run the parent cleanup, transfer the
        output files, and let the child terminate.
        Returns a triple of raw exit code, resource usage, and the result of
        parent_cleanup_fn."""
        # 1024 bytes ought to be enough for everyone^Wour pickled result
        try:
            received = os.read(from_grandchild_copy, 1024)
        except OSError as e:
            if self.PROCESS_KILLED and e.errno == errno.EINTR:
                # Read was interrupted because of Ctrl+C, we just try again
                received = os.read(from_grandchild_copy, 1024)
            else:
                raise

        if not received:
            # Typically this means the child exited prematurely because an error
            # occurred, and check_child_exit_code() will handle this.
            # We close the pipe first, otherwise child could hang infinitely.
            os.close(from_grandchild_copy)
            os.close(to_grandchild_copy)
            check_child_exit_code()
            # Explicit raise instead of "assert False" such that we never try to
            # unpickle an empty result, even with "python -O".
            raise AssertionError(
                "Child process terminated cleanly without sending result"
            )

        # The data was written by our own child process; the tool itself does not
        # get this pipe because grandchild() closes it before the exec().
        exitcode, ru_child = pickle.loads(received)

        base_path = "/proc/{}/root".format(child_pid)
        parent_cleanup = parent_cleanup_fn(
            parent_setup, util.ProcessExitCode.from_raw(exitcode), base_path
        )

        if result_files_patterns:
            # As long as the child process exists
            # we can access the container file system here
            self._transfer_output_files(
                base_path + temp_dir, cwd, output_dir, result_files_patterns
            )

        os.close(from_grandchild_copy)
        os.write(to_grandchild_copy, MARKER_PARENT_POST_RUN_COMPLETED)
        os.close(to_grandchild_copy)  # signal child that it can terminate
        check_child_exit_code()

        return exitcode, ru_child, parent_cleanup

    return grandchild_pid, wait_for_grandchild
def _init_container(
    temp_dir,
    network_access,
    dir_modes,
    container_system_config,
    container_tmpfs,  # ignored, tmpfs is always used
):
    """
    Create a fork of this process in a container. This method only returns in the fork,
    so calling it seems like moving the current process into a container.

    @param temp_dir: directory (as str) that is passed on to
        _setup_container_filesystem() in encoded form
    @param network_access: if false, a new network namespace is created
        and only the loopback interface is activated
    @param dir_modes: dict that maps paths (as str) to directory modes;
        if container_system_config is set, an entry for container.CONTAINER_HOME
        is added to this dict unless one exists already
    @param container_system_config: whether to use a special system configuration
        in the container (private home directory, fixed uid/gid and hostname)
    @param container_tmpfs: ignored
    @raise BenchExecException: if the namespaces for the container cannot be created
    """
    # Prepare for private home directory, some tools write there
    if container_system_config:
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Preparations
    temp_dir = temp_dir.encode()
    # Encode all paths and order the entries by length of the path (shortest first).
    dir_modes = collections.OrderedDict(
        sorted(
            ((path.encode(), kind) for (path, kind) in dir_modes.items()),
            key=lambda tupl: len(tupl[0]),
        )
    )
    uid = container.CONTAINER_UID if container_system_config else os.getuid()
    gid = container.CONTAINER_GID if container_system_config else os.getgid()

    # Create container.
    # Contrary to ContainerExecutor, which uses clone to start a new process in new
    # namespaces, we use unshare, which puts the current process (the multiprocessing
    # worker process) into new namespaces.
    # The exception is the PID namespace, which will only apply to children processes.
    flags = (
        libc.CLONE_NEWNS
        | libc.CLONE_NEWUTS
        | libc.CLONE_NEWIPC
        | libc.CLONE_NEWUSER
        | libc.CLONE_NEWPID
    )
    if not network_access:
        flags |= libc.CLONE_NEWNET
    try:
        libc.unshare(flags)
    except OSError as e:
        if (
            e.errno == errno.EPERM
            and util.try_read_file("/proc/sys/kernel/unprivileged_userns_clone") == "0"
        ):
            raise BenchExecException(
                "Unprivileged user namespaces forbidden on this system, please "
                "enable them with 'sysctl kernel.unprivileged_userns_clone=1' "
                "or disable container mode"
            ) from e
        elif (
            e.errno in {errno.ENOSPC, errno.EINVAL}
            and util.try_read_file("/proc/sys/user/max_user_namespaces") == "0"
        ):
            # Same diagnosis as for containers created with clone:
            # Ubuntu has ENOSPC, Centos seems to produce EINVAL in this case
            raise BenchExecException(
                "Unprivileged user namespaces forbidden on this system, please "
                "enable by using 'sysctl -w user.max_user_namespaces=10000' "
                "(or another value) or disable container mode"
            ) from e
        else:
            raise BenchExecException(
                "Creating namespace for container mode failed: " + os.strerror(e.errno)
            ) from e

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # Because this process is not actually in the new PID namespace, we fork.
    # The child will be in the new PID namespace and will assume the role of the acting
    # multiprocessing worker (which it can do because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The original multiprocessing worker (the parent of the fork) must do nothing in
    # order to not confuse multiprocessing.
    pid = os.fork()
    if pid:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(pid, 0)
        # Terminate immediately without running any cleanup code of this process.
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    # Forbid other processes to access this process via ptrace or /proc.
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
    container.setup_seccomp_filter()