async def clone_child_task(
        parent: ForkThread,
        flags: CLONE,
        trampoline_func: t.Callable[[FileDescriptor], Trampoline],
) -> t.Tuple[AsyncChildProcess, Task]:
    """Clone a new child process and setup the sysif and task to manage it

    We rely on trampoline_func to take a socket and give us a native function call with
    arguments that will speak the rsyscall protocol over that socket.

    We also create a futex process, which we use to monitor the ctid futex. This process
    allows us to detect when the child successfully finishes an exec; see the docstring of
    ChildSyscallInterface. Because we set CLONE.CHILD_CLEARTID, the ctid futex will
    receive a FUTEX_WAKE when the child process exits or execs, and the futex process
    will accordingly exit.

    """
    # Open a channel which we'll use for the rsyscall connection
    [(access_sock, remote_sock)] = await parent.connection.open_async_channels(1)
    # Create a trampoline that will start the new process running an rsyscall server
    trampoline = trampoline_func(remote_sock)
    # Force these flags to be used
    flags |= CLONE.VM|CLONE.FILES|CLONE.IO|CLONE.SYSVSEM
    # TODO it is unclear why we sometimes need to make a new mapping here, instead of
    # allocating with our normal allocator; all our memory is already MAP.SHARED, I think.
    # We should resolve this so we can use the normal allocator.
    arena = Arena(await parent.task.mmap(4096*2, PROT.READ|PROT.WRITE, MAP.SHARED))
    async def op(sem: RAM) -> t.Tuple[t.Tuple[Pointer[Stack], WrittenPointer[Stack]],
                                      WrittenPointer[FutexNode]]:
        stack_value = parent.loader.make_trampoline_stack(trampoline)
        stack_buf = await sem.malloc(Stack, 4096)
        stack = await stack_buf.write_to_end(stack_value, alignment=16)
        futex_pointer = await sem.ptr(FutexNode(None, Int32(0)))
        return stack, futex_pointer
    # Create the stack we'll need, and the zero-initialized futex
    stack, futex_pointer = await parent.ram.perform_batch(op, arena)
    # it's important to start the processes in this order, so that the thread
    # process is the first process started; this is relevant in several
    # situations, including unshare(NEWPID) and manipulation of ns_last_pid
    child_process = await parent.monitor.clone(flags|CLONE.CHILD_CLEARTID, stack, ctid=futex_pointer)
    futex_process = await launch_futex_monitor(
        parent.ram, parent.loader, parent.monitor, futex_pointer)
    # Create the new syscall interface, which needs to use not just the connection,
    # but also the child process and the futex process.
    syscall = ChildSyscallInterface(SyscallConnection(access_sock, access_sock),
                                    child_process, futex_process)
    # Set up the new task with appropriately inherited namespaces, tables, etc.
    # TODO correctly track all the namespaces we're in
    if flags & CLONE.NEWPID:
        pidns = far.PidNamespace(child_process.process.near.id)
    else:
        pidns = parent.task.pidns
    task = Task(syscall, child_process.process,
                parent.task.fd_table, parent.task.address_space, pidns)
    task.sigmask = parent.task.sigmask
    # Move ownership of the remote sock into the task and store it so it isn't closed
    remote_sock_handle = remote_sock.move(task)
    syscall.store_remote_side_handles(remote_sock_handle, remote_sock_handle)
    return child_process, task
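# A minimal standalone sketch (not rsyscall code) of the ctid-futex mechanism the
# docstring above relies on: with CLONE.CHILD_CLEARTID, the kernel clears the futex
# word to 0 and issues a FUTEX_WAKE on it when the child exits or execs, so a watcher
# only needs to FUTEX_WAIT until that happens. Syscall numbers are for x86-64 Linux;
# everything here is illustrative and not part of the library.
import ctypes, ctypes.util

_libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
_SYS_futex = 202   # x86-64
_FUTEX_WAIT = 0

def wait_until_cleared(futex_word: ctypes.c_int32) -> None:
    "Block until the kernel clears futex_word to 0 and wakes us (child exit or exec)."
    while futex_word.value != 0:
        # FUTEX_WAIT only sleeps if the word still holds the expected value;
        # otherwise it returns immediately (EAGAIN) and we re-check.
        _libc.syscall(_SYS_futex, ctypes.byref(futex_word), _FUTEX_WAIT,
                      futex_word.value, None, None, 0)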
async def _make_local_thread() -> Thread:
    """Create the local thread, allocating various resources locally.

    For the most part, the local thread is like any other thread; it just bootstraps
    differently, and uses syscall and memory interfaces which are specialized to the
    local thread.

    """
    process = near.Process(os.getpid())
    task = Task(
        LocalSyscall(), process,
        far.FDTable(process.id),
        far.AddressSpace(process.id),
        far.PidNamespace(process.id),
    )
    ram = RAM(task,
              LocalMemoryTransport(task),
              memory.AllocatorClient.make_allocator(task))
    epfd = await task.epoll_create()
    async def wait_readable():
        logger.debug("wait_readable(%s)", epfd.near.number)
        await trio.hazmat.wait_readable(epfd.near.number)
    epoller = Epoller.make_subsidiary(ram, epfd, wait_readable)
    thread = Thread(
        task, ram,
        await FDPassConnection.make(task, ram, epoller),
        NativeLoader.make_from_symbols(task, lib),
        epoller,
        await ChildProcessMonitor.make(ram, task, epoller),
        Environment(task, ram, {key.encode(): value.encode()
                                for key, value in os.environ.items()}),
        stdin=task.make_fd_handle(near.FileDescriptor(0)),
        stdout=task.make_fd_handle(near.FileDescriptor(1)),
        stderr=task.make_fd_handle(near.FileDescriptor(2)),
    )
    return thread
async def _make_local_process() -> Process:
    """Create the local process, allocating various resources locally.

    For the most part, the local process is like any other process; it just bootstraps
    differently, and uses syscall and memory interfaces which are specialized to the
    local process.

    """
    pid = near.Pid(os.getpid())
    task = Task(
        pid,
        handle.FDTable(pid.id),
        far.AddressSpace(pid.id),
        far.PidNamespace(pid.id),
        far.MountNamespace(pid.id),
    )
    task.sysif = LocalSyscall(task)
    task.allocator = await memory.AllocatorClient.make_allocator(task)
    epfd = await task.epoll_create()
    async def wait_readable():
        logger.debug("wait_readable(%s)", epfd.near.number)
        await trio.lowlevel.wait_readable(epfd.near.number)
    trio_system_wait_readable = TrioSystemWaitReadable(epfd.near.number)
    set_trio_system_wait_readable(trio_system_wait_readable)
    epoller = Epoller.make_subsidiary(epfd, trio_system_wait_readable.wait)
    process = Process(
        task,
        await FDPassConnection.make(task, epoller),
        NativeLoader.make_from_symbols(task, lib),
        epoller,
        await ChildPidMonitor.make(task, epoller),
        Environment.make_from_environ(task, {**os.environ}),
        stdin=task.make_fd_handle(near.FileDescriptor(0)),
        stdout=task.make_fd_handle(near.FileDescriptor(1)),
        stderr=task.make_fd_handle(near.FileDescriptor(2)),
    )
    return process
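# A minimal sketch (not rsyscall code) of the "subsidiary epoller" idea used in both
# bootstraps above: register a single epoll fd with the host event loop (trio), and only
# call epoll_wait once trio reports that fd readable, so the wait never blocks the loop.
# The dispatch callable is hypothetical.
import select
import trio

async def subsidiary_epoll_loop(epfd: select.epoll, dispatch) -> None:
    while True:
        # Let trio do the blocking for us; poll(0) below then returns immediately.
        await trio.lowlevel.wait_readable(epfd.fileno())
        for fd, events in epfd.poll(0):
            dispatch(fd, events)  # hypothetical: hand events to whoever registered fd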
async def ssh_bootstrap(
        parent: Process,
        # the actual ssh command to run
        ssh_command: SSHCommand,
        # the local path we'll use for the socket
        local_socket_path: Path,
        # the directory we're bootstrapping out of
        tmp_path_bytes: bytes,
) -> t.Tuple[AsyncChildPid, Process]:
    "Over ssh, run the bootstrap executable and set up a Process on the remote host"
    # identify local path
    local_data_addr = await parent.ram.ptr(
        await SockaddrUn.from_path(parent, local_socket_path))
    # start port forwarding; we'll just leak this process, no big deal
    # TODO we shouldn't leak processes; we should be GCing processes at some point
    forward_child_pid = await ssh_forward(
        parent, ssh_command, local_socket_path, (tmp_path_bytes + b"/data").decode())
    # start bootstrap
    bootstrap_process = await parent.fork()
    bootstrap_child_pid = await bootstrap_process.exec(ssh_command.args(
        "-n", f"cd {tmp_path_bytes.decode()}; exec ./bootstrap rsyscall"
    ))
    # TODO should unlink the bootstrap after I'm done execing.
    # it would be better if sh supported fexecve, then I could unlink it before I exec...
    # Connect to the local socket twice: once for the syscall connection, once for data transfer
    async def make_async_connection() -> AsyncFileDescriptor:
        sock = await parent.make_afd(await parent.socket(AF.UNIX, SOCK.STREAM|SOCK.NONBLOCK))
        await sock.connect(local_data_addr)
        return sock
    async_local_syscall_sock = await make_async_connection()
    async_local_data_sock = await make_async_connection()
    # Read description off of the data sock
    describe_buf = AsyncReadBuffer(async_local_data_sock)
    describe_struct = await describe_buf.read_cffi('struct rsyscall_bootstrap')
    new_pid = describe_struct.pid
    environ = await describe_buf.read_envp(describe_struct.envp_count)
    # Build the new task!
    new_address_space = far.AddressSpace(new_pid)
    # TODO the pid namespace will probably be common for all connections...
    # TODO we should get this from the SSHHost, this is usually going
    # to be common for all connections and we should express that
    new_pid_namespace = far.PidNamespace(new_pid)
    new_pid = near.Pid(new_pid)
    new_base_task = Task(
        new_pid, handle.FDTable(new_pid), new_address_space, new_pid_namespace,
    )
    handle_remote_syscall_fd = new_base_task.make_fd_handle(
        near.FileDescriptor(describe_struct.syscall_sock))
    new_base_task.sysif = SyscallConnection(
        logger.getChild(str(new_pid)),
        async_local_syscall_sock, async_local_syscall_sock,
        handle_remote_syscall_fd, handle_remote_syscall_fd,
    )
    handle_remote_data_fd = new_base_task.make_fd_handle(
        near.FileDescriptor(describe_struct.data_sock))
    handle_listening_fd = new_base_task.make_fd_handle(
        near.FileDescriptor(describe_struct.listening_sock))
    new_allocator = memory.AllocatorClient.make_allocator(new_base_task)
    new_transport = SocketMemoryTransport(async_local_data_sock, handle_remote_data_fd)
    # we don't inherit SignalMask; we assume ssh zeroes the sigmask before starting us
    new_ram = RAM(new_base_task, new_transport, new_allocator)
    epoller = await Epoller.make_root(new_ram, new_base_task)
    child_monitor = await ChildPidMonitor.make(new_ram, new_base_task, epoller)
    await handle_listening_fd.fcntl(F.SETFL, O.NONBLOCK)
    connection = ListeningConnection(
        parent.task, parent.ram, parent.epoller,
        local_data_addr,
        new_base_task, new_ram,
        await AsyncFileDescriptor.make(epoller, new_ram, handle_listening_fd),
    )
    new_process = Process(
        task=new_base_task,
        ram=new_ram,
        connection=connection,
        loader=NativeLoader.make_from_symbols(new_base_task, describe_struct.symbols),
        epoller=epoller,
        child_monitor=child_monitor,
        environ=Environment.make_from_environ(new_base_task, new_ram, environ),
        stdin=new_base_task.make_fd_handle(near.FileDescriptor(0)),
        stdout=new_base_task.make_fd_handle(near.FileDescriptor(1)),
        stderr=new_base_task.make_fd_handle(near.FileDescriptor(2)),
    )
    return bootstrap_child_pid, new_process
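# A minimal sketch (not rsyscall code) of what AsyncReadBuffer.read_cffi is doing above:
# read exactly sizeof(struct) bytes off the data socket, then parse the fixed-layout
# record. The field layout here is an illustrative guess, not the real
# struct rsyscall_bootstrap.
import struct
import trio

BOOTSTRAP_FMT = "<5i"  # hypothetical: pid, listening_sock, syscall_sock, data_sock, envp_count

async def read_bootstrap(stream: trio.abc.ReceiveStream) -> tuple:
    size = struct.calcsize(BOOTSTRAP_FMT)
    buf = b""
    while len(buf) < size:
        chunk = await stream.receive_some(size - len(buf))
        if not chunk:
            raise EOFError("bootstrap socket closed before the full record arrived")
        buf += chunk
    return struct.unpack(BOOTSTRAP_FMT, buf)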
async def clone_child_task(
        task: Task,
        ram: RAM,
        connection: Connection,
        loader: NativeLoader,
        monitor: ChildProcessMonitor,
        flags: CLONE,
        trampoline_func: t.Callable[[FileDescriptor], Trampoline],
) -> t.Tuple[AsyncChildProcess, Task]:
    """Clone a new child process and setup the sysif and task to manage it

    We rely on trampoline_func to take a socket and give us a native function call with
    arguments that will speak the rsyscall protocol over that socket.

    We want to see EOF on our local socket if that remote socket is no longer being read;
    for example, if the process exits or execs. This is not automatic for us: since the
    process might share its file descriptor table with other processes, remote_sock might
    not be closed when the process exits or execs.

    To ensure that we get an EOF, we use the ctid futex, which, thanks to
    CLONE.CHILD_CLEARTID, will be cleared and receive a futex wakeup when the child
    process exits or execs.

    When we see that futex wakeup (from Python, with the futex integrated into our event
    loop through launch_futex_monitor), we call shutdown(SHUT.RDWR) on the local socket
    from the parent. This results in future reads returning EOF.

    """
    # These flags are mandatory; if we don't use CLONE_VM then CHILD_CLEARTID doesn't work
    # properly and our only other recourse to detect exec is to abuse robust futexes.
    flags |= CLONE.VM | CLONE.CHILD_CLEARTID
    # Open a channel which we'll use for the rsyscall connection
    [(access_sock, remote_sock)] = await connection.open_async_channels(1)
    # Create a trampoline that will start the new process running an rsyscall server
    trampoline = trampoline_func(remote_sock)
    # TODO it is unclear why we sometimes need to make a new mapping here, instead of
    # allocating with our normal allocator; all our memory is already MAP.SHARED, I think.
    # We should resolve this so we can use the normal allocator.
    arena = Arena(await task.mmap(4096 * 2, PROT.READ | PROT.WRITE, MAP.SHARED))
    async def op(sem: RAM) -> t.Tuple[t.Tuple[Pointer[Stack], WrittenPointer[Stack]],
                                      WrittenPointer[FutexNode]]:
        stack_value = loader.make_trampoline_stack(trampoline)
        stack_buf = await sem.malloc(Stack, 4096)
        stack = await stack_buf.write_to_end(stack_value, alignment=16)
        futex_pointer = await sem.ptr(FutexNode(None, Int32(1)))
        return stack, futex_pointer
    # Create the stack we'll need, and the futex that CLONE.CHILD_CLEARTID will clear
    # and wake when the child exits or execs
    stack, futex_pointer = await ram.perform_batch(op, arena)
    # it's important to start the processes in this order, so that the thread
    # process is the first process started; this is relevant in several
    # situations, including unshare(NEWPID) and manipulation of ns_last_pid
    child_process = await monitor.clone(flags, stack, ctid=futex_pointer)
    # We want to be able to rely on getting an EOF if the other side of the syscall
    # connection is no longer being read (e.g., if the process exits or execs). Since the
    # process might share its file descriptor table with other processes, remote_sock
    # might not be closed when the process exits or execs. To ensure that we get an EOF,
    # we use the ctid futex, which will be cleared on process exit or exec; we shutdown
    # access_sock when the ctid futex is cleared, to get an EOF.
    # We do this with launch_futex_monitor and a background coroutine.
    futex_process = await launch_futex_monitor(ram, loader, monitor, futex_pointer)
    async def shutdown_access_sock_on_futex_process_exit():
        try:
            await futex_process.waitpid(W.EXITED)
        except SyscallError:
            # if the parent of the futex_process dies, this syscall
            # connection is broken anyway, so shut it down.
            pass
        await access_sock.handle.shutdown(SHUT.RDWR)
    # Running this in the background, without an associated object, is a bit dubious...
    reset(shutdown_access_sock_on_futex_process_exit())
    # Set up the new task with appropriately inherited namespaces, tables, etc.
    # TODO correctly track all the namespaces we're in
    if flags & CLONE.NEWPID:
        pidns = far.PidNamespace(child_process.process.near.id)
    else:
        pidns = task.pidns
    if flags & CLONE.FILES:
        fd_table = task.fd_table
    else:
        fd_table = handle.FDTable(child_process.process.near.id, task.fd_table)
    child_task = Task(child_process.process, fd_table, task.address_space, pidns)
    child_task.sigmask = task.sigmask
    # Move ownership of the remote sock into the child task and store it so it isn't closed
    remote_sock_handle = remote_sock.inherit(child_task)
    await remote_sock.invalidate()
    # Create the new syscall interface, which needs to use not just the connection,
    # but also the futex process.
    child_task.sysif = SyscallConnection(
        logger.getChild(str(child_process.process.near)),
        access_sock, access_sock,
        remote_sock_handle, remote_sock_handle,
    )
    return child_process, child_task
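# A minimal sketch (not rsyscall code) of the background shutdown pattern above, using a
# plain trio nursery instead of rsyscall's reset(); wait_for_monitor_exit and sock stand
# in for futex_process.waitpid(W.EXITED) and access_sock.
import socket
import trio

def shutdown_on_monitor_exit(nursery: trio.Nursery, wait_for_monitor_exit,
                             sock: socket.socket) -> None:
    async def watcher() -> None:
        await wait_for_monitor_exit()       # returns once the futex monitor has exited
        sock.shutdown(socket.SHUT_RDWR)     # pending and future reads on our side now see EOF
    nursery.start_soon(watcher)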