def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.
                # According to http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python < 3.3 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance (that's what we do).
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc()
                container.drop_capabilities()
                container.reset_signal_handling()
                child_setup_fn()  # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == b"\0", received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
        def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.  According to
                # http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python 2 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance
                # (that's what we do).
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc(self._container_system_config)
                container.drop_capabilities()
                container.reset_signal_handling()
                child_setup_fn()  # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID
                # and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == MARKER_PARENT_COMPLETED, received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
        def child():
            """Setup everything inside the container,
            start the tool, and wait for result."""
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs(),
                )

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file
                # descriptors, to avoid keeping other pipes and files open, e.g.,
                # those that the parent uses to communicate with other containers
                # (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually. We keep the relevant ends of our pipes,
                # and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin,
                    sys.stdout,
                    sys.stderr,
                    to_parent,
                    from_parent,
                    stdin,
                    stdout,
                    stderr,
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if self._container_system_config:
                        # A standard hostname increases reproducibility.
                        socket.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Wait until user mapping is finished,
                    # this is necessary for filesystem writes
                    received = os.read(from_parent,
                                       len(MARKER_USER_MAPPING_COMPLETED))
                    assert received == MARKER_USER_MAPPING_COMPLETED, received

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes,
                        )

                    # Marking this process as "non-dumpable" (no core dumps) also
                    # forbids several other ways how other processes can access and
                    # influence it:
                    # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
                    # We set this to prevent the benchmarked tool from messing with this
                    # process or using it to escape from the container. More info:
                    # http://man7.org/linux/man-pages/man5/proc.5.html
                    # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
                    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0,
                               0, 0)
                except OSError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except OSError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                container.setup_seccomp_filter()

                try:
                    grandchild_proc = subprocess.Popen(
                        args,
                        stdin=stdin,
                        stdout=stdout,
                        stderr=stderr,
                        env=env,
                        close_fds=False,
                        preexec_fn=grandchild,
                    )
                except (OSError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                # keep capability for unmount if necessary later
                necessary_capabilities = ([libc.CAP_SYS_ADMIN]
                                          if result_files_patterns else [])
                container.drop_capabilities(keep=necessary_capabilities)

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={
                    sys.stdout, sys.stderr, to_parent, from_parent
                })

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                grandchild_result = container.wait_for_child_and_forward_signals(
                    grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.",
                    args[0],
                    grandchild_result[0],
                )

                if result_files_patterns:
                    # Remove the bind mount that _setup_container_filesystem added
                    # such that the parent can access the result files.
                    libc.umount(temp_dir.encode())

                # Re-allow access to /proc/<child>/...,
                # this is used by the parent for accessing output files
                libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                # Now the parent copies the output files, we need to wait until this is
                # finished. If the child terminates, the container file system and its
                # tmpfs go away.
                assert os.read(from_parent,
                               1) == MARKER_PARENT_POST_RUN_COMPLETED
                os.close(from_parent)

                return 0
            except OSError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except BaseException:
                # Need to catch everything because this method always needs to return an
                # int (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR
        def child():
            """Setup everything inside the container, start the tool, and wait for result."""
            try:
                logging.debug("Child: child process of RunExecutor with PID %d started",
                              container.get_my_pid_from_procfs())

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file descriptors,
                # to avoid keeping other pipes and files open, e.g., those that the parent
                # uses to communicate with other containers (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually.
                # We keep the relevant ends of our pipes, and stdin/out/err of child and grandchild.
                necessary_fds = {sys.stdin, sys.stdout, sys.stderr,
                    to_parent, from_parent, stdin, stdout, stderr} - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if not self._allow_network:
                        container.activate_network_interface("lo")
                    self._setup_container_filesystem(temp_dir)
                except EnvironmentError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return int(e.errno)

                try:
                    os.chdir(cwd)
                except EnvironmentError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s", e)
                    return int(e.errno)

                try:
                    grandchild_proc = subprocess.Popen(args,
                                        stdin=stdin,
                                        stdout=stdout, stderr=stderr,
                                        env=env,
                                        close_fds=False,
                                        preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    try:
                        return int(e.errno)
                    except BaseException:
                        # subprocess.Popen in Python 2.7 throws OSError with errno=None
                        # if the preexec_fn fails.
                        return -2

                container.drop_capabilities()

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                container.forward_all_signals(grandchild_proc.pid, args[0])

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={sys.stdout, sys.stderr, to_parent})

                # wait for grandchild and return its result
                grandchild_result = self._wait_for_process(grandchild_proc.pid, args[0])
                logging.debug("Child: process %s terminated with exit code %d.",
                              args[0], grandchild_result[0])
                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                return 0
            except EnvironmentError as e:
                logging.exception("Error in child process of RunExecutor")
                return int(e.errno)
            except:
                # Need to catch everything because this method always needs to return a int
                # (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return -1
Exemple #5
0
        def child():
            """Setup everything inside the container, start the tool, and wait for result."""
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs())

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file descriptors,
                # to avoid keeping other pipes and files open, e.g., those that the parent
                # uses to communicate with other containers (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually.
                # We keep the relevant ends of our pipes, and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin, sys.stdout, sys.stderr, to_parent, from_parent,
                    stdin, stdout, stderr
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(temp_dir)
                except EnvironmentError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except EnvironmentError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                try:
                    grandchild_proc = subprocess.Popen(args,
                                                       stdin=stdin,
                                                       stdout=stdout,
                                                       stderr=stderr,
                                                       env=env,
                                                       close_fds=False,
                                                       preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                container.drop_capabilities()

                # Close other fds that were still necessary above.
                container.close_open_fds(
                    keep_files={sys.stdout, sys.stderr, to_parent})

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                if _HAS_SIGWAIT:
                    grandchild_result = container.wait_for_child_and_forward_all_signals(
                        grandchild_proc.pid, args[0])
                else:
                    container.forward_all_signals_async(
                        grandchild_proc.pid, args[0])
                    grandchild_result = self._wait_for_process(
                        grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.", args[0],
                    grandchild_result[0])
                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                return 0
            except EnvironmentError as e:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except:
                # Need to catch everything because this method always needs to return a int
                # (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR
Exemple #6
0
def _init_container_and_load_tool(
        tool_module,
        temp_dir,
        network_access,
        dir_modes,
        container_system_config,
        container_tmpfs,  # ignored, tmpfs is always used
):
    """Initialize container for the current process and load given tool-info module."""
    # Prepare for private home directory, some tools write there
    if container_system_config:
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Preparations
    temp_dir = temp_dir.encode()
    dir_modes = collections.OrderedDict(
        sorted(
            ((path.encode(), kind) for (path, kind) in dir_modes.items()),
            key=lambda tupl: len(tupl[0]),
        ))
    uid = container.CONTAINER_UID if container_system_config else os.getuid()
    gid = container.CONTAINER_GID if container_system_config else os.getgid()

    # Create container.
    # Contrary to ContainerExecutor, which uses clone to start a new process in new
    # namespaces, we use unshare, which puts the current process (the multiprocessing
    # worker process) into new namespaces.
    # The exception is the PID namespace, which will only apply to children processes.
    flags = (libc.CLONE_NEWNS
             | libc.CLONE_NEWUTS
             | libc.CLONE_NEWIPC
             | libc.CLONE_NEWUSER
             | libc.CLONE_NEWPID)
    if not network_access:
        flags |= libc.CLONE_NEWNET
    libc.unshare(flags)

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # Because this process is not actually in the new PID namespace, we fork.
    # The child will be in the new PID namespace and will assume the role of the acting
    # multiprocessing worker (which it can do because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The original multiprocessing worker (the parent of the fork) must do nothing in
    # order to not confuse multiprocessing.
    pid = os.fork()
    if pid:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(pid, 0)
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)

    logging.debug("Loading tool-info module %s in container", tool_module)
    global tool
    try:
        tool = __import__(tool_module, fromlist=["Tool"]).Tool()
    except BaseException as e:
        tool = None
        return e
    return tool.__doc__
def _init_container(
    temp_dir,
    network_access,
    dir_modes,
    container_system_config,
    container_tmpfs,  # ignored, tmpfs is always used
):
    """
    Create a fork of this process in a container. This method only returns in the fork,
    so calling it seems like moving the current process into a container.
    """
    # Prepare for private home directory, some tools write there
    if container_system_config:
        dir_modes.setdefault(container.CONTAINER_HOME, container.DIR_HIDDEN)
        os.environ["HOME"] = container.CONTAINER_HOME

    # Preparations
    temp_dir = temp_dir.encode()
    dir_modes = collections.OrderedDict(
        sorted(
            ((path.encode(), kind) for (path, kind) in dir_modes.items()),
            key=lambda tupl: len(tupl[0]),
        )
    )
    uid = container.CONTAINER_UID if container_system_config else os.getuid()
    gid = container.CONTAINER_GID if container_system_config else os.getgid()

    # Create container.
    # Contrary to ContainerExecutor, which uses clone to start a new process in new
    # namespaces, we use unshare, which puts the current process (the multiprocessing
    # worker process) into new namespaces.
    # The exception is the PID namespace, which will only apply to children processes.
    flags = (
        libc.CLONE_NEWNS
        | libc.CLONE_NEWUTS
        | libc.CLONE_NEWIPC
        | libc.CLONE_NEWUSER
        | libc.CLONE_NEWPID
    )
    if not network_access:
        flags |= libc.CLONE_NEWNET
    try:
        libc.unshare(flags)
    except OSError as e:
        if (
            e.errno == errno.EPERM
            and util.try_read_file("/proc/sys/kernel/unprivileged_userns_clone") == "0"
        ):
            raise BenchExecException(
                "Unprivileged user namespaces forbidden on this system, please "
                "enable them with 'sysctl kernel.unprivileged_userns_clone=1' "
                "or disable container mode"
            )
        else:
            raise BenchExecException(
                "Creating namespace for container mode failed: " + os.strerror(e.errno)
            )

    # Container config
    container.setup_user_mapping(os.getpid(), uid, gid)
    _setup_container_filesystem(temp_dir, dir_modes, container_system_config)
    if container_system_config:
        libc.sethostname(container.CONTAINER_HOSTNAME)
    if not network_access:
        container.activate_network_interface("lo")

    # Because this process is not actually in the new PID namespace, we fork.
    # The child will be in the new PID namespace and will assume the role of the acting
    # multiprocessing worker (which it can do because it inherits the file descriptors
    # that multiprocessing uses for communication).
    # The original multiprocessing worker (the parent of the fork) must do nothing in
    # order to not confuse multiprocessing.
    pid = os.fork()
    if pid:
        container.drop_capabilities()
        # block parent such that it does nothing
        os.waitpid(pid, 0)
        os._exit(0)

    # Finalize container setup in child
    container.mount_proc(container_system_config)  # only possible in child
    container.drop_capabilities()
    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0, 0, 0)
    container.setup_seccomp_filter()
Exemple #8
0
        def child():
            """Setup everything inside the container, start the tool, and wait for result."""
            try:
                logging.debug("Child: child process of RunExecutor with PID %d started",
                              container.get_my_pid_from_procfs())

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file descriptors,
                # to avoid keeping other pipes and files open, e.g., those that the parent
                # uses to communicate with other containers (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually.
                # We keep the relevant ends of our pipes, and stdin/out/err of child and grandchild.
                necessary_fds = {sys.stdin, sys.stdout, sys.stderr,
                    to_parent, from_parent, stdin, stdout, stderr} - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if self._container_system_config:
                        # A standard hostname increases reproducibility.
                        libc.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Wait until user mapping is finished, this is necessary for filesystem writes
                    received = os.read(from_parent, len(MARKER_USER_MAPPING_COMPLETED))
                    assert received == MARKER_USER_MAPPING_COMPLETED, received

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes)
                except EnvironmentError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except EnvironmentError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s", e)
                    return CHILD_OSERROR

                try:
                    grandchild_proc = subprocess.Popen(args,
                                        stdin=stdin,
                                        stdout=stdout, stderr=stderr,
                                        env=env,
                                        close_fds=False,
                                        preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                # keep capability for unmount if necessary later
                necessary_capabilities = [libc.CAP_SYS_ADMIN] if result_files_patterns else []
                container.drop_capabilities(keep=necessary_capabilities)

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={sys.stdout, sys.stderr, to_parent, from_parent})

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                if _HAS_SIGWAIT:
                    grandchild_result = container.wait_for_child_and_forward_all_signals(
                        grandchild_proc.pid, args[0])
                else:
                    container.forward_all_signals_async(grandchild_proc.pid, args[0])
                    grandchild_result = self._wait_for_process(grandchild_proc.pid, args[0])

                logging.debug("Child: process %s terminated with exit code %d.",
                              args[0], grandchild_result[0])

                if result_files_patterns:
                    # Remove the bind mount that _setup_container_filesystem added
                    # such that the parent can access the result files.
                    libc.umount(temp_dir.encode())

                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                # Now the parent copies the output files, we need to wait until this is finished.
                # If the child terminates, the container file system and its tmpfs go away.
                os.read(from_parent, 1)
                os.close(from_parent)

                return 0
            except EnvironmentError as e:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except:
                # Need to catch everything because this method always needs to return a int
                # (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR