def mount_proc(container_system_config):
    """Mount a fresh /proc and optionally overlay it with lxcfs-provided files.

    @param container_system_config: Whether to mount container-specific files in /proc
    """
    # A reference to the outer /proc is kept somewhere else, because it is
    # still needed for converting our PID between the namespaces.
    libc.mount(b"proc", b"/proc", b"proc", 0, None)

    use_lxcfs = container_system_config and os.access(LXCFS_PROC_DIR, os.R_OK)
    if not use_lxcfs:
        return

    # lxcfs offers container-aware variants of several /proc files
    # (e.g., /proc/uptime). Each of them is bind-mounted over the file
    # that the kernel provides.
    for entry in os.listdir(LXCFS_PROC_DIR):
        lxcfs_file = os.path.join(LXCFS_PROC_DIR, entry)
        proc_file = os.path.join(b"/proc", entry)
        make_bind_mount(lxcfs_file, proc_file, private=True)

    # The bind mounts above break nested containers: the kernel refuses to
    # mount a new proc file system (in the nested container) unless the nested
    # container can access a clean and fully visible proc instance.
    # Hence the container gets two instances: the one in the expected place
    # (with the lxcfs bind mounts on top) and a second, untouched one that is
    # hidden somewhere and hopefully never used by anybody. The location of the
    # hidden instance is irrelevant as long as the directory always exists and
    # is never used. /proc/1/ns always exists, and because we disable
    # PR_SET_DUMPABLE it would not be accessible anyway, so it fits the bill.
    libc.mount(b"proc", b"/proc/1/ns", b"proc", 0, None)
def make_overlay_mount(mount, lower, upper, work):
    """Mount an overlay file system.

    All paths are passed to overlayfs inside a single comma-separated option
    string, so they are escaped before being embedded in it.

    @param mount: the mount target as bytes
    @param lower: the lower (underlying) directory as bytes
    @param upper: the upper directory as bytes
    @param work: the overlayfs work directory as bytes
    """
    logging.debug(
        "Creating overlay mount: target=%s, lower=%s, upper=%s, work=%s",
        mount,
        lower,
        upper,
        work,
    )

    def escape(path):
        """Escape a path for use as a value in the overlayfs mount options.

        "," separates mount options and ":" separates multiple lower dirs,
        so both need a backslash, as does the backslash itself
        (it is escaped first to avoid escaping the added backslashes again).
        """
        return (
            path.replace(b"\\", rb"\\").replace(b":", rb"\:").replace(b",", rb"\,")
        )

    libc.mount(
        b"none",
        mount,
        b"overlay",
        0,
        b"lowerdir="
        + escape(lower)
        + b",upperdir="
        + escape(upper)
        + b",workdir="
        + escape(work),
    )
def make_overlay_mount(mount, lower, upper, work):
    """Mount an overlay file system with the given layers.

    @param mount: the mount target as bytes
    @param lower: the lower (underlying) directory as bytes
    @param upper: the upper directory as bytes
    @param work: the overlayfs work directory as bytes
    """
    logging.debug(
        "Creating overlay mount: target=%s, lower=%s, upper=%s, work=%s",
        mount,
        lower,
        upper,
        work,
    )

    def escape(s):
        """
        Make a path safe for being embedded in the overlayfs mount options.
        "," must be escaped because it separates mount options, and ":" because
        overlayfs uses it to separate multiple lower dirs
        (cf. https://www.kernel.org/doc/Documentation/filesystems/overlayfs.txt).
        """
        # The backslash has to come first, otherwise the backslashes
        # inserted for the other characters would be escaped again.
        for special in (b"\\", b":", b","):
            s = s.replace(special, b"\\" + special)
        return s

    options = b",".join(
        (
            b"lowerdir=" + escape(lower),
            b"upperdir=" + escape(upper),
            b"workdir=" + escape(work),
        )
    )
    libc.mount(b"none", mount, b"overlay", 0, options)
def _setup_container_filesystem(temp_dir, dir_modes, container_system_config):
    """Create the container's mount hierarchy below temp_dir and chroot into it.

    @param temp_dir: base directory (as bytes) that gets a tmpfs mounted on it
        and then holds all directories created here
    @param dir_modes: directory modes, passed on to the mount-hierarchy setup;
        directories listed here are not touched by the /dev/shm handling
    @param container_system_config: whether to set up the container-specific
        system configuration
    """
    # All temp files live on a RAM disk.
    libc.mount(None, temp_dir, b"tmpfs", 0, b"size=100%")

    # mount_base: base dir for container mounts
    # temp_base: upper layer for overlayfs
    # work_base: work dir for overlayfs
    mount_base, temp_base, work_base = (
        os.path.join(temp_dir, name) for name in (b"mount", b"temp", b"overlayfs")
    )
    for directory in (mount_base, temp_base, work_base):
        os.mkdir(directory)

    container.duplicate_mount_hierarchy(mount_base, temp_base, work_base, dir_modes)

    # Ensure that a writable directory from temp_base is mounted on the
    # shared-memory directories, if they exist and the user did not
    # configure them explicitly.
    for shm_dir in (b"/dev/shm", b"/run/shm"):
        if shm_dir in dir_modes:
            continue  # explicitly configured by user
        shm_in_container = mount_base + shm_dir
        if not os.path.isdir(shm_in_container):
            continue
        shm_backing = temp_base + shm_dir
        util.makedirs(shm_backing, exist_ok=True)
        container.make_bind_mount(shm_backing, shm_in_container)

    if container_system_config:
        container.setup_container_system_config(temp_base, mount_base, dir_modes)

    # Remember the working directory and enter the same path again
    # after the root directory has been switched.
    previous_cwd = os.getcwd()
    container.chroot(mount_base)
    os.chdir(previous_cwd)
def remount_with_additional_flags(mountpoint, existing_options, mountflags):
    """Remount an existing mount point, keeping its options and adding flags.

    @param mountpoint: the mount point as bytes
    @param existing_options: dict with the current mount options as bytes
    @param mountflags: int with additional mount flags (cf. libc.MS_* constants)
    """
    # Every option that is currently set and has a known flag
    # must be passed again for the remount.
    inherited_flags = [
        flag
        for option, flag in libc.MOUNT_FLAGS.items()
        if option in existing_options
    ]

    flags = mountflags | libc.MS_REMOUNT | libc.MS_BIND
    for flag in inherited_flags:
        flags |= flag

    libc.mount(None, mountpoint, None, flags, None)
def mount_proc(container_system_config):
    """Mount the /proc filesystem.

    If requested and lxcfs is available, the container-aware files provided by
    lxcfs are bind-mounted over the kernel-provided files in /proc. In that
    case a second, unmodified proc instance is mounted as well, such that
    nested containers keep working.

    @param container_system_config: Whether to mount container-specific files in /proc
    """
    # We keep a reference to the outer /proc somewhere else because we need it
    # to convert our PID between the namespaces.
    libc.mount(b"proc", b"/proc", b"proc", 0, None)

    # lxcfs provides container-aware versions of several /proc files, e.g. /proc/uptime
    # If lxcfs is available, bind-mount these files over the kernel-provided files.
    if container_system_config and os.access(LXCFS_PROC_DIR, os.R_OK):
        for f in os.listdir(LXCFS_PROC_DIR):
            make_bind_mount(
                os.path.join(LXCFS_PROC_DIR, f),
                os.path.join(b"/proc", f),
                private=True,
            )

        # Making the above bind mounts on top of /proc breaks nested containers:
        # the kernel does not allow mounting a new proc file system (in the
        # nested container) if the nested container has no access to a clean
        # and fully visible instance of the proc file system.
        # So we provide a second instance without the bind mounts on top and
        # hide it in a directory that always exists and is never used.
        libc.mount(b"proc", b"/proc/1/ns", b"proc", 0, None)
def make_tmpfs_dir(path):
    """Mount a tmpfs on path inside the container, provided the path exists.

    @param path: absolute path as bytes; it is appended to mount_base and
        temp_base, which come from the enclosing scope
    """
    if path in self._dir_modes:
        return  # explicitly configured by user

    target_in_container = mount_base + path
    backing_dir = temp_base + path
    # The backing directory is created regardless of whether the target exists.
    util.makedirs(backing_dir, exist_ok=True)

    if not os.path.isdir(target_in_container):
        return

    if self._container_tmpfs:
        # A tmpfs exists already, so a bind mount of a directory in it suffices.
        container.make_bind_mount(backing_dir, target_in_container)
    else:
        # Otherwise a dedicated tmpfs instance is necessary.
        libc.mount(None, target_in_container, b"tmpfs", 0, tmpfs_opts)
def make_bind_mount(source, target, recursive=False, private=False):
    """Bind-mount source onto target.

    @param source: the source directory as bytes
    @param target: the target directory as bytes
    @param recursive: whether to also recursively bind mount all mounts below source
    @param private: whether to mark the bind as private, i.e., changes to the
        existing mounts won't propagate and vice-versa
        (changes to files/dirs will still be visible).
    """
    # NOTE(review): mount(2) describes propagation flags such as MS_PRIVATE as a
    # separate kind of mount operation; confirm that combining it with MS_BIND
    # in a single call has the intended effect.
    flags = libc.MS_BIND
    flags |= libc.MS_REC if recursive else 0
    flags |= libc.MS_PRIVATE if private else 0
    libc.mount(source, target, None, flags, None)
def remount_with_additional_flags(mountpoint, fstype, existing_options, mountflags):
    """Remount an existing mount point, keeping its options and adding flags.

    @param mountpoint: the mount point as bytes
    @param fstype: the file system as bytes
    @param existing_options: dict with the current mount options as bytes
    @param mountflags: int with additional mount flags (cf. libc.MS_* constants)
    """
    flags = mountflags | libc.MS_REMOUNT | libc.MS_BIND

    # Carry over the flag of every option that is currently set.
    flags_to_keep = (
        flag
        for option, flag in libc.MOUNT_FLAGS.items()
        if option in existing_options
    )
    for flag in flags_to_keep:
        flags |= flag

    if fstype == b"sysfs":
        # sysfs is always mounted with these options.
        # This won't hurt and seems to be necessary in edge cases like #749.
        flags |= libc.MS_NODEV | libc.MS_NOSUID | libc.MS_NOEXEC

    libc.mount(None, mountpoint, None, flags, None)
def mount_proc():
    """Mount a new instance of the proc file system on /proc."""
    # A reference to the outer /proc is kept somewhere else, because it is
    # still needed for converting our PID between the namespaces.
    proc_fstype = b"proc"
    mount_target = b"/proc"
    libc.mount(proc_fstype, mount_target, proc_fstype, 0, None)
def _setup_container_filesystem(self, temp_dir, output_dir, memlimit, memory_nodes):
    """Set up the filesystem layout of the container and chroot into it.

    First, a copy of all existing mountpoints is created in mount_base,
    recursively and as "private" mounts (i.e., later changes to the existing
    mountpoints do not propagate to our copy). Every mountpoint is then changed
    according to the mode the user has specified (hidden, read-only, overlay,
    or full-access). This has to be done per mountpoint because overlays are
    not recursive. Finally we chroot into the new mount hierarchy.

    The new layout still has a view of the host's /proc. No fresh /proc is
    mounted here because the grandchild still needs the old one.

    We do not simply iterate over the existing mount points and change them in
    place, because creating a new hierarchy and chrooting into it is easier:
    the original mountpoints stay accessible while we work, and we avoid race
    conditions if someone else changes the existing mountpoints.

    @param temp_dir: The base directory (as str) under which all our
        directories should be created.
    @param output_dir: If truthy, temp_base is additionally bind-mounted
        read-only onto temp_dir inside the container.
    @param memlimit: Size for the tmpfs instances; "100%" is used if falsy.
    @param memory_nodes: If non-empty, added to the tmpfs options
        as "mpol=bind:" followed by the comma-separated nodes.
    """
    # From here on all paths are bytes in order to avoid issues
    # if existing mountpoints are invalid UTF-8.
    # temp_base (the directory with files created by the tool) has to be
    # computed while temp_dir is still a str.
    temp_base = self._get_result_files_base(temp_dir).encode()
    temp_dir = temp_dir.encode()

    option_parts = ["size=%s" % (memlimit or "100%",)]
    if memory_nodes:
        node_list = ",".join(str(node) for node in memory_nodes)
        option_parts.append("mpol=bind:" + node_list)
    tmpfs_opts = ",".join(option_parts).encode()

    if self._container_tmpfs:
        libc.mount(None, temp_dir, b"tmpfs", 0, tmpfs_opts)

    # mount_base is the base dir for container mounts.
    mount_base = os.path.join(temp_dir, b"mount")
    os.mkdir(mount_base)
    os.mkdir(temp_base)

    # Overlayfs requires an additional temporary directory of its own
    # (the "work" directory). temp_base serves as "upper" layer,
    # the host FS as "lower" layer, and mount_base as mount target.
    work_base = os.path.join(temp_dir, b"overlayfs")
    os.mkdir(work_base)

    # Copy all mounts to mount_base and apply the directory modes.
    container.duplicate_mount_hierarchy(
        mount_base, temp_base, work_base, self._dir_modes
    )

    # Special hard-coded cases: these directories should be writable RAM disks
    # for Posix shared memory. For example, the Python multiprocessing module
    # explicitly checks for a tmpfs instance.
    for shm_dir in (b"/dev/shm", b"/run/shm"):
        if shm_dir in self._dir_modes:
            continue  # explicitly configured by user
        shm_in_container = mount_base + shm_dir
        shm_backing = temp_base + shm_dir
        os.makedirs(shm_backing, exist_ok=True)
        if not os.path.isdir(shm_in_container):
            continue
        if self._container_tmpfs:
            # We have a tmpfs already, so a bind mount is enough.
            container.make_bind_mount(shm_backing, shm_in_container)
        else:
            # Without it, a separate tmpfs instance is required.
            libc.mount(None, shm_in_container, b"tmpfs", 0, tmpfs_opts)

    if self._container_system_config:
        container.setup_container_system_config(
            temp_base, mount_base, self._dir_modes
        )

    temp_dir_in_container = mount_base + temp_dir

    if output_dir:
        # In order to copy result files out of temp_base, temp_base must be
        # visible in the container, which needs a mountpoint that is guaranteed
        # to exist. The tool in the container must not have access to
        # temp_base, of course, so another bind mount with an empty directory
        # is put on top (equivalent to --hidden-dir). After the tool has
        # terminated, the top-level bind mount can be unmounted and temp_base
        # is accessible. This only works if there is no other mount point below
        # that directory, and with directory modes the user can force us to
        # create mount points at arbitrary directories. An existing directory
        # without mount points below is thus required, and temp_dir fulfills
        # all requirements (we have just created it as a fresh directory).
        # So temp_base outside of the container gets mounted to temp_dir inside.
        os.makedirs(temp_dir_in_container, exist_ok=True)
        container.make_bind_mount(temp_base, temp_dir_in_container, read_only=True)
        # The branch below then hides this bind mount
        # beneath an empty directory automatically.

    # If necessary (i.e., if /tmp is not already hidden), the directory where
    # we store our files is hidden from processes in the container
    # by mounting an empty directory over it.
    if os.path.exists(temp_dir_in_container):
        empty_dir = temp_base + temp_dir
        os.makedirs(empty_dir, exist_ok=True)
        container.make_bind_mount(empty_dir, temp_dir_in_container)

    # Finally, mount_base becomes the new root directory.
    container.chroot(mount_base)
def make_overlay_mount(mount, lower, upper, work):
    """Mount an overlay file system.

    The three directories are handed to overlayfs as one comma-separated option
    string, so special characters in the paths are escaped first.

    @param mount: the mount target as bytes
    @param lower: the lower (underlying) directory as bytes
    @param upper: the upper directory as bytes
    @param work: the overlayfs work directory as bytes
    """
    logging.debug(
        "Creating overlay mount: target=%s, lower=%s, upper=%s, work=%s",
        mount,
        lower,
        upper,
        work,
    )

    def escape(path):
        """Escape a path for use as a value in the overlayfs mount options.

        A backslash is put in front of "," (separator of mount options),
        ":" (separator of multiple lower dirs), and the backslash itself.
        """
        # Backslashes are handled first so that the escapes
        # added for the other characters are not escaped again.
        for special in (b"\\", b":", b","):
            path = path.replace(special, b"\\" + special)
        return path

    options = (
        b"lowerdir="
        + escape(lower)
        + b",upperdir="
        + escape(upper)
        + b",workdir="
        + escape(work)
    )
    libc.mount(b"none", mount, b"overlay", 0, options)
def _setup_container_filesystem(self, temp_dir, output_dir, memlimit, memory_nodes):
    """Setup the filesystem layout in the container.
    As first step, we create a copy of all existing mountpoints in mount_base,
    recursively, and as "private" mounts (i.e., changes to existing mountpoints
    afterwards won't propagate to our copy).
    Then we iterate over all mountpoints and change them according to the mode
    the user has specified (hidden, read-only, overlay, or full-access).
    This has to be done for each mountpoint because overlays are not recursive.
    Then we chroot into the new mount hierarchy.

    The new filesystem layout still has a view of the host's /proc. We do not
    mount a fresh /proc here because the grandchild still needs the old /proc.

    We do not simply iterate over all existing mount points and set them to
    read-only/overlay them, because it is easier to create a new hierarchy and
    chroot into it. First, we still have access to the original mountpoints
    while doing so, and second, we avoid race conditions if someone else changes
    the existing mountpoints.

    @param temp_dir: The base directory (as str) under which all our directories
        should be created.
    @param output_dir: If truthy, temp_base is additionally bind-mounted
        read-only onto temp_dir inside the container.
    @param memlimit: Size for the tmpfs instances; "100%" is used if falsy.
    @param memory_nodes: If non-empty, added to the tmpfs options
        as "mpol=bind:" followed by the comma-separated nodes.
    """
    # All strings here are bytes to avoid issues if existing mountpoints are invalid UTF-8.
    # temp_base has to be computed before temp_dir is converted to bytes.
    temp_base = self._get_result_files_base(temp_dir).encode()  # directory with files created by tool
    temp_dir = temp_dir.encode()

    # Build the option string that is used for every tmpfs mounted below.
    tmpfs_opts = ["size=" + str(memlimit or "100%")]
    if memory_nodes:
        tmpfs_opts.append("mpol=bind:" + ",".join(map(str, memory_nodes)))
    tmpfs_opts = (",".join(tmpfs_opts)).encode()
    if self._container_tmpfs:
        libc.mount(None, temp_dir, b"tmpfs", 0, tmpfs_opts)

    mount_base = os.path.join(temp_dir, b"mount")  # base dir for container mounts
    os.mkdir(mount_base)
    os.mkdir(temp_base)

    def _is_below(path, target_path):
        """Return whether path is equal to target_path or located below it."""
        # compare with trailing slashes for cases like /foo and /foobar
        path = os.path.join(path, b"")
        target_path = os.path.join(target_path, b"")
        return path.startswith(target_path)

    def find_mode_for_dir(path, fstype=None):
        """Determine the directory mode to apply to path.

        @param path: the path inside the container as bytes
        @param fstype: the file-system type of the mount at path as bytes, if known
        @return: one of the DIR_* modes, or None if nothing should be done for path
        """
        if (path == b"/proc"):
            # /proc is necessary for the grandchild to read PID, will be replaced later.
            return DIR_READ_ONLY
        if _is_below(path, b"/proc"):
            # Irrelevant.
            return None

        parent_mode = None
        result_mode = None
        # NOTE(review): if several configured directories contain path, the entry
        # that comes last in iteration order wins; presumably self._dir_modes is
        # ordered from least to most specific directory - confirm.
        for special_dir, mode in self._dir_modes.items():
            if _is_below(path, special_dir):
                if path != special_dir:
                    parent_mode = mode
                result_mode = mode
        # Relies on self._dir_modes having an entry that covers every path.
        assert result_mode is not None

        if result_mode == DIR_OVERLAY and (
                _is_below(path, b"/dev") or
                _is_below(path, b"/sys") or
                fstype == b"cgroup"):
            # Overlay does not make sense for /dev, /sys, and all cgroups.
            return DIR_READ_ONLY

        if result_mode == DIR_OVERLAY and (
                fstype == b"autofs" or
                fstype == b"vfat" or
                fstype == b"ntfs"):
            # Overlayfs does not support these as underlying file systems.
            logging.debug("Cannot use overlay mode for %s because it has file system %s. "
                          "Using read-only mode instead.",
                          path.decode(), fstype.decode())
            return DIR_READ_ONLY

        if result_mode == DIR_HIDDEN and parent_mode == DIR_HIDDEN:
            # No need to recursively recreate mountpoints in hidden dirs.
            return None
        return result_mode

    # Overlayfs needs its own additional temporary directory ("work" directory).
    # temp_base will be the "upper" layer, the host FS the "lower" layer,
    # and mount_base the mount target.
    work_base = os.path.join(temp_dir, b"overlayfs")
    os.mkdir(work_base)

    # Create a copy of host's mountpoints.
    # Setting the MS_PRIVATE flag decouples our mount namespace from the host's,
    # i.e., mounts we do are not seen by the host, and any (un)mounts the host does afterward
    # are not seen by us. The latter is desired such that new mounts (e.g.,
    # USB sticks being plugged in) do not appear in the container.
    # Blocking host-side unmounts from being propagated has the disadvantage
    # that any unmounts done by the sysadmin won't really unmount the device
    # because it stays mounted in the container and thus keeps the device busy
    # (cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=739593#85).
    # We could allow unmounts being propagated with MS_SLAVE instead of MS_PRIVATE,
    # but we prefer to have the mount namespace of the container being
    # unchanged during run execution.
    container.make_bind_mount(b"/", mount_base, recursive=True, private=True)

    # Ensure each special dir is a mountpoint such that the next loop covers it.
    for special_dir in self._dir_modes.keys():
        mount_path = mount_base + special_dir
        temp_path = temp_base + special_dir
        try:
            container.make_bind_mount(mount_path, mount_path)
        except OSError as e:
            # on btrfs, non-recursive bind mounts fail
            if e.errno == errno.EINVAL:
                try:
                    container.make_bind_mount(mount_path, mount_path, recursive=True)
                except OSError as e2:
                    # Best effort: the failure is only logged.
                    logging.debug("Failed to make %s a (recursive) bind mount: %s", mount_path, e2)
            else:
                # Best effort: the failure is only logged.
                logging.debug("Failed to make %s a bind mount: %s", mount_path, e)
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)

    # Set desired access mode for each mountpoint.
    # The mount points are copied into a list first
    # because the loop body changes the mounts.
    for unused_source, full_mountpoint, fstype, options in list(container.get_mount_points()):
        if not _is_below(full_mountpoint, mount_base):
            continue
        # Path of the mountpoint as seen from inside the container.
        mountpoint = full_mountpoint[len(mount_base):] or b"/"
        mode = find_mode_for_dir(mountpoint, fstype)
        if not mode:
            continue

        if not os.access(os.path.dirname(mountpoint), os.X_OK):
            # If parent is not accessible we cannot mount something on mountpoint.
            # We mark the inaccessible directory as hidden because otherwise the mountpoint
            # could become accessible (directly!) if the permissions on the parent
            # are relaxed during container execution.
            original_mountpoint = mountpoint
            parent = os.path.dirname(mountpoint)
            # Walk upwards until the parent directory is accessible.
            while not os.access(parent, os.X_OK):
                mountpoint = parent
                parent = os.path.dirname(mountpoint)
            mode = DIR_HIDDEN
            logging.debug(
                "Marking inaccessible directory '%s' as hidden "
                "because it contains a mountpoint at '%s'",
                mountpoint.decode(), original_mountpoint.decode())
        else:
            logging.debug("Mounting '%s' as %s", mountpoint.decode(), mode)

        mount_path = mount_base + mountpoint
        temp_path = temp_base + mountpoint
        work_path = work_base + mountpoint

        if mode == DIR_OVERLAY:
            if not os.path.exists(temp_path):
                os.makedirs(temp_path)
            if not os.path.exists(work_path):
                os.makedirs(work_path)
            try:
                # Previous mount in this place not needed if replaced with overlay dir.
                libc.umount(mount_path)
            except OSError as e:
                logging.debug(e)
            try:
                container.make_overlay_mount(mount_path, mountpoint, temp_path, work_path)
            except OSError as e:
                # Re-raise with a message that tells the user what to do.
                raise OSError(e.errno,
                              "Creating overlay mount for '{}' failed: {}. "
                              "Please use other directory modes."
                              .format(mountpoint.decode(), os.strerror(e.errno)))

        elif mode == DIR_HIDDEN:
            if not os.path.exists(temp_path):
                os.makedirs(temp_path)
            try:
                # Previous mount in this place not needed if replaced with hidden dir.
                libc.umount(mount_path)
            except OSError as e:
                logging.debug(e)
            # Mount the (empty) directory from temp_base over the mountpoint.
            container.make_bind_mount(temp_path, mount_path)

        elif mode == DIR_READ_ONLY:
            try:
                container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)
            except OSError as e:
                if e.errno == errno.EACCES:
                    logging.warning(
                        "Cannot mount '%s', directory may be missing from container.",
                        mountpoint.decode())
                else:
                    # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                    # Linux does not support making read-only bind mounts in one step:
                    # https://lwn.net/Articles/281157/ http://man7.org/linux/man-pages/man8/mount.8.html
                    container.make_bind_mount(
                        mountpoint, mount_path, recursive=True, private=True)
                    container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)

        elif mode == DIR_FULL_ACCESS:
            try:
                # Ensure directory is still a mountpoint by attempting to remount.
                container.remount_with_additional_flags(mount_path, options, 0)
            except OSError as e:
                if e.errno == errno.EACCES:
                    logging.warning(
                        "Cannot mount '%s', directory may be missing from container.",
                        mountpoint.decode())
                else:
                    # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                    container.make_bind_mount(
                        mountpoint, mount_path, recursive=True, private=True)

        else:
            # No other mode is handled here.
            assert False

    # Now configure some special hard-coded cases

    def make_tmpfs_dir(path):
        """Ensure that a tmpfs is mounted on path, if the path exists"""
        if path in self._dir_modes:
            return  # explicitly configured by user
        mount_tmpfs = mount_base + path
        temp_tmpfs = temp_base + path
        util.makedirs(temp_tmpfs, exist_ok=True)
        if os.path.isdir(mount_tmpfs):
            # If we already have a tmpfs, we can just bind mount it, otherwise we need one
            if self._container_tmpfs:
                container.make_bind_mount(temp_tmpfs, mount_tmpfs)
            else:
                libc.mount(None, mount_tmpfs, b"tmpfs", 0, tmpfs_opts)

    # The following directories should be writable RAM disks for Posix shared memory.
    # For example, the Python multiprocessing module explicitly checks for a tmpfs instance.
    make_tmpfs_dir(b"/dev/shm")
    make_tmpfs_dir(b"/run/shm")

    if self._container_system_config:
        # If overlayfs is not used for /etc, we need additional bind mounts
        # for files in /etc that we want to override, like /etc/passwd
        config_mount_base = mount_base if find_mode_for_dir(b"/etc") != DIR_OVERLAY else None
        container.setup_container_system_config(temp_base, config_mount_base)

    if output_dir:
        # We need a way to see temp_base in the container in order to be able to copy result
        # files out of it, so we need a directory that is guaranteed to exist in order to use
        # it as mountpoint for a bind mount to temp_base.
        # Of course, the tool inside the container should not have access to temp_base,
        # so we will add another bind mount with an empty directory on top
        # (equivalent to --hidden-dir). After the tool terminates we can unmount
        # the top-level bind mount and then access temp_base. However, this works only
        # if there is no other mount point below that directory, and the user can force us
        # to create mount points at arbitrary directory if a directory mode is specified.
        # So we need an existing directory with no mount points below, and luckily temp_dir
        # fulfills all requirements (because we have just created it as fresh directory ourselves).
        # So we mount temp_base outside of the container to temp_dir inside.
        util.makedirs(mount_base + temp_dir, exist_ok=True)
        container.make_bind_mount(temp_base, mount_base + temp_dir, read_only=True)
        # And the following if branch will automatically hide the bind
        # mount below an empty directory.

    # If necessary, (i.e., if /tmp is not already hidden),
    # hide the directory where we store our files from processes in the container
    # by mounting an empty directory over it.
    if os.path.exists(mount_base + temp_dir):
        util.makedirs(temp_base + temp_dir, exist_ok=True)
        container.make_bind_mount(temp_base + temp_dir, mount_base + temp_dir)

    # Make mount_base the new root directory.
    os.chroot(mount_base)