def child():
    """Setup everything inside the container, start the tool, and wait for result.

    This is a closure: apart from module-level names it relies on variables of
    the enclosing scope (e.g. ``self``, ``args``, ``env``, ``cwd``, ``root_dir``,
    ``temp_dir``, ``output_dir``, ``result_files_patterns``, ``memlimit``,
    ``memory_nodes``, the pipe ends ``to_parent``/``from_parent``, the
    ``stdin``/``stdout``/``stderr`` of the tool, and the ``grandchild`` callback).

    The steps are, in order: block signals, close unneeded file descriptors,
    configure hostname/network/filesystem, change into ``cwd``, install the
    seccomp filter, start the tool as grandchild, drop capabilities, wait for
    the grandchild while forwarding signals, send the pickled result to the
    parent, and finally wait until the parent signals that it is done.

    @return: 0 on success, CHILD_OSERROR if an OSError occurred (or the tool
        could not be started), CHILD_UNKNOWN_ERROR for any other exception.
        This function never raises.
    """
    try:
        logging.debug(
            "Child: child process of RunExecutor with PID %d started",
            container.get_my_pid_from_procfs(),
        )

        # Put all received signals on hold until we handle them later.
        container.block_all_signals()

        # We want to avoid leaking file descriptors to the executed child.
        # It is also nice if the child has only the minimal necessary file
        # descriptors, to avoid keeping other pipes and files open, e.g.,
        # those that the parent uses to communicate with other containers
        # (if containers are started in parallel).
        # Thus we do not use the close_fds feature of subprocess.Popen,
        # but do the same here manually. We keep the relevant ends of our pipes,
        # and stdin/out/err of child and grandchild.
        necessary_fds = {
            sys.stdin,
            sys.stdout,
            sys.stderr,
            to_parent,
            from_parent,
            stdin,
            stdout,
            stderr,
        } - {None}
        container.close_open_fds(keep_files=necessary_fds)

        try:
            if self._container_system_config:
                # A standard hostname increases reproducibility.
                socket.sethostname(container.CONTAINER_HOSTNAME)

            if not self._allow_network:
                container.activate_network_interface("lo")

            # Wait until user mapping is finished,
            # this is necessary for filesystem writes
            received = os.read(from_parent, len(MARKER_USER_MAPPING_COMPLETED))
            assert received == MARKER_USER_MAPPING_COMPLETED, received

            if root_dir is not None:
                self._setup_root_filesystem(root_dir)
            else:
                self._setup_container_filesystem(
                    temp_dir,
                    output_dir if result_files_patterns else None,
                    memlimit,
                    memory_nodes,
                )

            # Marking this process as "non-dumpable" (no core dumps) also
            # forbids several other ways how other processes can access and
            # influence it:
            # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
            # We set this to prevent the benchmarked tool from messing with this
            # process or using it to escape from the container. More info:
            # http://man7.org/linux/man-pages/man5/proc.5.html
            # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
            libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
        except OSError as e:
            logging.critical("Failed to configure container: %s", e)
            return CHILD_OSERROR

        try:
            os.chdir(cwd)
        except OSError as e:
            logging.critical(
                "Cannot change into working directory inside container: %s", e)
            return CHILD_OSERROR

        container.setup_seccomp_filter()

        try:
            grandchild_proc = subprocess.Popen(
                args,
                stdin=stdin,
                stdout=stdout,
                stderr=stderr,
                env=env,
                close_fds=False,
                preexec_fn=grandchild,
            )
        except (OSError, RuntimeError) as e:
            logging.critical("Cannot start process: %s", e)
            return CHILD_OSERROR

        # keep capability for unmount if necessary later
        necessary_capabilities = (
            [libc.CAP_SYS_ADMIN] if result_files_patterns else []
        )
        container.drop_capabilities(keep=necessary_capabilities)

        # Close other fds that were still necessary above.
        container.close_open_fds(
            keep_files={sys.stdout, sys.stderr, to_parent, from_parent}
        )

        # Set up signal handlers to forward signals to grandchild
        # (because we are PID 1, there is a special signal handling otherwise).
        # cf. dumb-init project: https://github.com/Yelp/dumb-init
        # Also wait for grandchild and return its result.
        grandchild_result = container.wait_for_child_and_forward_signals(
            grandchild_proc.pid, args[0])

        logging.debug(
            "Child: process %s terminated with exit code %d.",
            args[0],
            grandchild_result[0],
        )

        if result_files_patterns:
            # Remove the bind mount that _setup_container_filesystem added
            # such that the parent can access the result files.
            libc.umount(temp_dir.encode())

        # Re-allow access to /proc/<child>/...,
        # this is used by the parent for accessing output files
        libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

        os.write(to_parent, pickle.dumps(grandchild_result))
        os.close(to_parent)

        # Now the parent copies the output files, we need to wait until this is
        # finished. If the child terminates, the container file system and its
        # tmpfs go away.
        # The blocking read is deliberately kept outside of the assert statement:
        # with "python -O" assert statements are removed entirely, and the read
        # (i.e., the waiting for the parent) must still happen in that case.
        post_run_marker = os.read(from_parent, 1)
        assert post_run_marker == MARKER_PARENT_POST_RUN_COMPLETED
        os.close(from_parent)

        return 0
    except OSError:
        logging.exception("Error in child process of RunExecutor")
        return CHILD_OSERROR
    except BaseException:
        # Need to catch everything because this method always needs to return an
        # int (we are inside a C callback that requires returning int).
        logging.exception("Error in child process of RunExecutor")
        return CHILD_UNKNOWN_ERROR
def _init_container(
    temp_dir,
    network_access,
    dir_modes,
    container_system_config,
    container_tmpfs,  # ignored, tmpfs is always used
):
    """
    Put the current process into new namespaces and continue in a forked child.

    The caller only ever gets control back in the forked child: the original
    process waits for that child and then exits via os._exit(0). For the caller
    this looks as if the current process had been moved into a container.

    @param temp_dir: path (str) passed on to _setup_container_filesystem
    @param network_access: if false, a separate network namespace is created
        and only the loopback interface is activated
    @param dir_modes: dict mapping paths (str) to directory modes; gets an entry
        for the container home directory added if container_system_config is set
    @param container_system_config: whether to use the container's
        uid/gid/home/hostname instead of the values of the current user
    @param container_tmpfs: unused
    @raise BenchExecException: if creating the namespaces fails
    """
    if container_system_config:
        # Some tools write into the home directory, so provide a private one.
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Convert paths to bytes and order the directory modes by path length.
    temp_dir = temp_dir.encode()
    encoded_modes = [(path.encode(), kind) for (path, kind) in dir_modes.items()]
    encoded_modes.sort(key=lambda entry: len(entry[0]))
    dir_modes = collections.OrderedDict(encoded_modes)

    if container_system_config:
        uid, gid = container.CONTAINER_UID, container.CONTAINER_GID
    else:
        uid, gid = os.getuid(), os.getgid()

    # Create container.
    # ContainerExecutor starts a new process in new namespaces with clone,
    # whereas here unshare moves the current process (the multiprocessing
    # worker process) into new namespaces.
    # Only the PID namespace is different: it applies to children processes only.
    namespace_flags = libc.CLONE_NEWNS
    namespace_flags |= libc.CLONE_NEWUTS
    namespace_flags |= libc.CLONE_NEWIPC
    namespace_flags |= libc.CLONE_NEWUSER
    namespace_flags |= libc.CLONE_NEWPID
    if not network_access:
        namespace_flags |= libc.CLONE_NEWNET

    try:
        libc.unshare(namespace_flags)
    except OSError as e:
        userns_forbidden = (
            e.errno == errno.EPERM
            and util.try_read_file("/proc/sys/kernel/unprivileged_userns_clone")
            == "0"
        )
        if userns_forbidden:
            raise BenchExecException(
                "Unprivileged user namespaces forbidden on this system, please "
                "enable them with 'sysctl kernel.unprivileged_userns_clone=1' "
                "or disable container mode"
            )
        raise BenchExecException(
            "Creating namespace for container mode failed: " + os.strerror(e.errno)
        )

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # This process itself is not in the new PID namespace, so we fork.
    # The child is in the new PID namespace and takes over the role of the
    # multiprocessing worker (possible because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The parent of the fork (the original worker) must stay passive in order
    # to not confuse multiprocessing.
    child_pid = os.fork()
    if child_pid != 0:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(child_pid, 0)
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
    container.setup_seccomp_filter()
def _init_container_and_load_tool(
    tool_module,
    temp_dir,
    network_access,
    dir_modes,
    container_system_config,
    container_tmpfs,  # ignored, tmpfs is always used
):
    """
    Containerize the current process, then load the given tool-info module in it.

    The container is created with unshare followed by a fork; only the forked
    child continues past the fork, the original process waits for it and exits.
    The loaded tool instance is stored in the module-level variable "tool".

    @param tool_module: name of the module that provides a class "Tool"
    @param temp_dir: path (str) passed on to _setup_container_filesystem
    @param network_access: if false, a separate network namespace is created
        and only the loopback interface is activated
    @param dir_modes: dict mapping paths (str) to directory modes; gets an entry
        for the container home directory added if container_system_config is set
    @param container_system_config: whether to use the container's
        uid/gid/home/hostname instead of the values of the current user
    @param container_tmpfs: unused
    @return: the docstring of the tool instance on success; if importing or
        instantiating the tool raised, that exception object is returned
        (not raised) and "tool" is set to None
    """
    global tool

    if container_system_config:
        # Some tools write into the home directory, so provide a private one.
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Convert paths to bytes and order the directory modes by path length.
    temp_dir = temp_dir.encode()
    encoded_modes = [(path.encode(), kind) for (path, kind) in dir_modes.items()]
    encoded_modes.sort(key=lambda entry: len(entry[0]))
    dir_modes = collections.OrderedDict(encoded_modes)

    if container_system_config:
        uid, gid = container.CONTAINER_UID, container.CONTAINER_GID
    else:
        uid, gid = os.getuid(), os.getgid()

    # Create container.
    # ContainerExecutor starts a new process in new namespaces with clone,
    # whereas here unshare moves the current process (the multiprocessing
    # worker process) into new namespaces.
    # Only the PID namespace is different: it applies to children processes only.
    namespace_flags = libc.CLONE_NEWNS
    namespace_flags |= libc.CLONE_NEWUTS
    namespace_flags |= libc.CLONE_NEWIPC
    namespace_flags |= libc.CLONE_NEWUSER
    namespace_flags |= libc.CLONE_NEWPID
    if not network_access:
        namespace_flags |= libc.CLONE_NEWNET
    libc.unshare(namespace_flags)

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # This process itself is not in the new PID namespace, so we fork.
    # The child is in the new PID namespace and takes over the role of the
    # multiprocessing worker (possible because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The parent of the fork (the original worker) must stay passive in order
    # to not confuse multiprocessing.
    child_pid = os.fork()
    if child_pid != 0:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(child_pid, 0)
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)

    logging.debug("Loading tool-info module %s in container", tool_module)
    try:
        tool = __import__(tool_module, fromlist=["Tool"]).Tool()
    except BaseException as load_error:
        # Hand the failure back as a return value instead of raising it.
        tool = None
        return load_error
    else:
        return tool.__doc__
def _load_seccomp():
    """
    Check whether seccomp filters can be used and load libseccomp.

    Two prctl probes are executed first; afterwards "libseccomp.so.2" is loaded
    into the module-level variable _lib and the ctypes prototypes of the used
    library functions are declared.

    @return: True if the library was loaded and set up, False if one of the
        probes or loading the library failed (a warning is logged in this case)
    """
    global _lib

    # First check if seccomp has a chance of working.
    try:
        libc.prctl(libc.PR_GET_SECCOMP, 0, 0, 0, 0)
    except OSError as e:
        logging.warning(
            "Seccomp is not available, container isolation is degraded (%s).",
            os.strerror(e.errno),
        )
        return False

    # Try to enable filter mode without passing a filter.
    try:
        libc.prctl(libc.PR_SET_SECCOMP, _SECCOMP_MODE_FILTER, 0, 0, 0)
    except OSError as e:
        filter_probe_error = e
    else:
        filter_probe_error = None

    if filter_probe_error is None:
        # The call was not supposed to succeed.
        logging.warning(
            "Unexpected failure when enabling seccomp filter, "
            "container isolation is degraded."
        )
        return False
    if filter_probe_error.errno != errno.EFAULT:
        # EFAULT is expected if passing null pointer as filter argument,
        # everything else is reported.
        logging.warning(
            "Unexpected failure when enabling seccomp filter, "
            "container isolation is degraded (%s).",
            filter_probe_error,
        )
        return False

    # Load library with utility functions.
    try:
        _lib = ctypes.CDLL("libseccomp.so.2", use_errno=True)
    except OSError as e:
        logging.warning(
            "Could not load libseccomp2, "
            "please install it for improved container isolation (%s).",
            e,
        )
        return False

    # Marker for "leave this attribute at the ctypes default"
    # (needed because None is a meaningful restype).
    keep_default = object()

    # (function name, argtypes, restype, errcheck)
    prototypes = (
        ("seccomp_init", [c_uint32], _scmp_filter_ctx, _check_null),
        ("seccomp_release", [_scmp_filter_ctx], None, keep_default),
        ("seccomp_export_pfc", [_scmp_filter_ctx, c_int], keep_default, _check_errno),
        ("seccomp_load", [_scmp_filter_ctx], keep_default, _check_errno),
        ("seccomp_arch_resolve_name", [c_char_p], c_uint32, keep_default),
        ("seccomp_arch_add", [_scmp_filter_ctx, c_uint32], keep_default, _check_errno),
        ("seccomp_syscall_resolve_name", [c_char_p], c_int, keep_default),
        (
            "seccomp_rule_add",
            [_scmp_filter_ctx, c_uint32, c_int, c_uint],
            keep_default,
            _check_errno,
        ),
    )
    for func_name, argtypes, restype, errcheck in prototypes:
        func = getattr(_lib, func_name)
        func.argtypes = argtypes
        if restype is not keep_default:
            func.restype = restype
        if errcheck is not keep_default:
            func.errcheck = errcheck

    return True