def test_exitcode():
    """A value sent through the queue becomes the child's exit code."""
    queue = mp_context.Queue()
    child = AsyncProcess(target=exit, kwargs={'q': queue})
    child.daemon = True

    # Before start(): neither alive nor exited.
    assert not child.is_alive()
    assert child.exitcode is None

    yield child.start()
    assert child.is_alive()
    assert child.exitcode is None

    # Tell the child which code to exit with, then wait for it.
    queue.put(5)
    yield child.join(timeout=3.0)
    assert not child.is_alive()
    assert child.exitcode == 5
def test_terminate():
    """terminate() stops the child and join() observes its death."""
    child = AsyncProcess(target=wait)
    child.daemon = True
    yield child.start()
    yield child.terminate()

    yield child.join(timeout=3.0)
    assert not child.is_alive()
    # Forkserver may report 255 instead of the negated signal number.
    assert child.exitcode in (-signal.SIGTERM, 255)
async def test_exit_callback():
    """The exit callback fires on both clean exit and terminate()."""
    to_child = mp_context.Queue()
    from_child = mp_context.Queue()
    evt = Event()

    # FIXME: this breaks if changed to async def...
    @gen.coroutine
    def on_stop(stopped):
        assert stopped is child
        yield gen.moment
        evt.set()

    # First scenario: the child exits normally after reading the sentinel.
    child = AsyncProcess(target=feed, args=(to_child, from_child))
    evt.clear()
    child.set_exit_callback(on_stop)
    child.daemon = True

    await child.start()
    await asyncio.sleep(0.05)
    assert child.is_alive()
    assert not evt.is_set()

    to_child.put(None)
    await evt.wait(timedelta(seconds=3))
    assert evt.is_set()
    assert not child.is_alive()

    # Second scenario: the child is killed via terminate().
    child = AsyncProcess(target=wait)
    evt.clear()
    child.set_exit_callback(on_stop)
    child.daemon = True
    await child.start()

    await asyncio.sleep(0.05)
    assert child.is_alive()
    assert not evt.is_set()

    await child.terminate()
    await evt.wait(timedelta(seconds=3))
    assert evt.is_set()
def test_signal():
    """A child killed by a signal reports a negative exit code."""
    child = AsyncProcess(target=exit_with_signal, args=(signal.SIGINT,))
    child.daemon = True
    assert not child.is_alive()
    assert child.exitcode is None

    yield child.start()
    yield child.join(timeout=3.0)

    assert not child.is_alive()
    # Can be 255 with forkserver, see https://bugs.python.org/issue30589
    assert child.exitcode in (-signal.SIGINT, 255)

    # Same check, but with the signal delivered from the outside.
    child = AsyncProcess(target=wait)
    yield child.start()
    os.kill(child.pid, signal.SIGTERM)
    yield child.join(timeout=3.0)

    assert not child.is_alive()
    assert child.exitcode in (-signal.SIGTERM, 255)
def test_signal():
    """Death by signal is reflected in ``exitcode``."""
    p = AsyncProcess(target=exit_with_signal, args=(signal.SIGINT,))
    p.daemon = True
    assert not p.is_alive() and p.exitcode is None

    yield p.start()
    yield p.join(timeout=3.0)
    assert not p.is_alive()
    # Can be 255 with forkserver, see https://bugs.python.org/issue30589
    assert p.exitcode in (-signal.SIGINT, 255)

    # Now kill a running child from the parent side.
    p = AsyncProcess(target=wait)
    yield p.start()
    os.kill(p.pid, signal.SIGTERM)
    yield p.join(timeout=3.0)
    assert not p.is_alive()
    assert p.exitcode in (-signal.SIGTERM, 255)
def test_exit_callback():
    """The callback registered via set_exit_callback() runs on child exit."""
    to_child = mp_context.Queue()
    from_child = mp_context.Queue()
    evt = Event()

    @gen.coroutine
    def on_stop(stopped):
        assert stopped is child
        yield gen.moment
        evt.set()

    # Clean exit of the child triggers the callback.
    child = AsyncProcess(target=feed, args=(to_child, from_child))
    evt.clear()
    child.set_exit_callback(on_stop)
    child.daemon = True
    yield child.start()
    yield gen.sleep(0.05)
    assert child.is_alive()
    assert not evt.is_set()

    to_child.put(None)
    yield evt.wait(timedelta(seconds=3))
    assert evt.is_set()
    assert not child.is_alive()

    # Termination of the child triggers the callback too.
    child = AsyncProcess(target=wait)
    evt.clear()
    child.set_exit_callback(on_stop)
    child.daemon = True
    yield child.start()
    yield gen.sleep(0.05)
    assert child.is_alive()
    assert not evt.is_set()

    yield child.terminate()
    yield evt.wait(timedelta(seconds=3))
    assert evt.is_set()
async def test_num_fds():
    """Starting and joining an AsyncProcess must not leak file descriptors.

    Fix: the final polling loop previously had no deadline, so an actual
    fd leak would hang the test forever instead of failing it. Bound the
    wait at 10 seconds, matching the yield-based variant of this test.
    """
    # Warm up -- the first process creation may allocate long-lived fds
    # (pipes, semaphores) that are not per-process leaks.
    proc = AsyncProcess(target=exit_now)
    proc.daemon = True
    await proc.start()
    await proc.join()

    p = psutil.Process()
    before = p.num_fds()

    proc = AsyncProcess(target=exit_now)
    proc.daemon = True
    await proc.start()
    await proc.join()
    assert not proc.is_alive()
    assert proc.exitcode == 0

    # fds are released asynchronously after join(); poll briefly, but
    # fail instead of hanging if they never come back down.
    deadline = time() + 10
    while p.num_fds() > before:
        assert time() < deadline, "AsyncProcess leaked file descriptors"
        await asyncio.sleep(0.01)
def test_num_fds():
    """File descriptors opened for a child are released after join()."""
    psutil = pytest.importorskip('psutil')

    # Warm up: the first spawn may create long-lived fds we don't count.
    worker = AsyncProcess(target=exit_now)
    worker.daemon = True
    yield worker.start()
    yield worker.join()

    current = psutil.Process()
    before = current.num_fds()

    worker = AsyncProcess(target=exit_now)
    worker.daemon = True
    yield worker.start()
    yield worker.join()
    assert not worker.is_alive()
    assert worker.exitcode == 0

    # Cleanup is asynchronous; poll for up to 10 seconds.
    start = time()
    while current.num_fds() > before:
        yield gen.sleep(0.1)
        print("fds:", before, current.num_fds())
        assert time() < start + 10
def test_simple():
    """End-to-end AsyncProcess lifecycle: start, join, exit code, GC.

    Fix: the weakref check for the AsyncProcess wrapper previously ran
    right after a single ``gc.collect()``.  The internal watcher
    coroutine can still hold the last reference for an instant after the
    child exits, making the check flaky.  Give it a bounded (1 s) grace
    period before declaring a leak, as the companion variant of this
    test does.
    """
    to_child = mp_context.Queue()
    from_child = mp_context.Queue()

    proc = AsyncProcess(target=feed, args=(to_child, from_child))
    assert not proc.is_alive()
    assert proc.pid is None
    assert proc.exitcode is None
    assert not proc.daemon
    proc.daemon = True
    assert proc.daemon

    wr1 = weakref.ref(proc)
    wr2 = weakref.ref(proc._process)

    # join() before start() is a usage error
    with pytest.raises(AssertionError):
        yield proc.join()

    yield proc.start()
    assert proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode is None

    # join() with a short timeout returns promptly while child is alive
    t1 = time()
    yield proc.join(timeout=0.02)
    dt = time() - t1
    assert 0.2 >= dt >= 0.01
    assert proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode is None

    # setting daemon attribute after start() is a usage error
    with pytest.raises(AssertionError):
        proc.daemon = False

    to_child.put(5)
    assert from_child.get() == 5

    # child should be stopping now
    t1 = time()
    yield proc.join(timeout=10)
    dt = time() - t1
    assert dt <= 1.0
    assert not proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode == 0

    # join() again is a no-op and returns quickly
    t1 = time()
    yield proc.join()
    dt = time() - t1
    assert dt <= 0.6

    del proc
    gc.collect()
    # The watcher coroutine may still briefly hold the last reference;
    # allow up to one second for it to let go before failing.
    start = time()
    while wr1() is not None and time() < start + 1:
        yield gen.sleep(0.001)
        gc.collect()
    if wr1() is not None:
        # Help diagnosing
        from types import FrameType
        p = wr1()
        if p is not None:
            rc = sys.getrefcount(p)
            refs = gc.get_referrers(p)
            del p
            print("refs to proc:", rc, refs)
            frames = [r for r in refs if isinstance(r, FrameType)]
            for i, f in enumerate(frames):
                print("frames #%d:" % i, f.f_code.co_name,
                      f.f_code.co_filename, sorted(f.f_locals))
        pytest.fail("AsyncProcess should have been destroyed")

    # The underlying multiprocessing.Process must also be collected soon.
    t1 = time()
    while wr2() is not None:
        yield gen.sleep(0.01)
        gc.collect()
    dt = time() - t1
    assert dt < 2.0
class WorkerProcess:
    """Manage the lifecycle of a single worker child process for a Nanny.

    Drives a small status machine (init -> starting -> running ->
    stopping -> stopped / failed) around an ``AsyncProcess`` that runs
    ``_run`` in the child.  Parent and child communicate over two
    multiprocessing queues: ``init_result_q`` (child -> parent startup
    outcome) and ``child_stop_q`` (parent -> child stop request).
    """

    # Set when the worker reaches Status.running / Status.stopped.
    running: asyncio.Event
    stopped: asyncio.Event

    # The interval how often to check the msg queue for init
    _init_msg_interval = 0.05

    def __init__(
        self,
        worker_kwargs,
        worker_start_args,
        silence_logs,
        on_exit,
        worker,
        env,
        config,
    ):
        self.status = Status.init
        self.silence_logs = silence_logs
        self.worker_kwargs = worker_kwargs
        self.worker_start_args = worker_start_args
        self.on_exit = on_exit
        self.process = None
        self.Worker = worker
        self.env = env
        self.config = config

        # Initialized when worker is ready
        self.worker_dir = None
        self.worker_address = None

    async def start(self) -> Status:
        """
        Ensure the worker process is started.

        Idempotent: returns immediately when already running, and waits
        on ``self.running`` when another caller is mid-start.  Returns
        the resulting status (``running`` on success, ``failed`` if the
        child could not be spawned or connected to).
        """
        enable_proctitle_on_children()
        if self.status == Status.running:
            return self.status
        if self.status == Status.starting:
            await self.running.wait()
            return self.status

        self.init_result_q = init_q = mp_context.Queue()
        self.child_stop_q = mp_context.Queue()
        # uid lets _wait_until_connected reject messages from a stale
        # child that wrote to a previous incarnation of the queue.
        uid = uuid.uuid4().hex

        self.process = AsyncProcess(
            target=self._run,
            name="Dask Worker process (from Nanny)",
            kwargs=dict(
                worker_kwargs=self.worker_kwargs,
                worker_start_args=self.worker_start_args,
                silence_logs=self.silence_logs,
                init_result_q=self.init_result_q,
                child_stop_q=self.child_stop_q,
                uid=uid,
                Worker=self.Worker,
                env=self.env,
                config=self.config,
            ),
        )
        self.process.daemon = dask.config.get("distributed.worker.daemon", default=True)
        self.process.set_exit_callback(self._on_exit)
        self.running = asyncio.Event()
        self.stopped = asyncio.Event()
        self.status = Status.starting

        try:
            await self.process.start()
        except OSError:
            # e.g. fork failure; mark failed rather than propagate.
            logger.exception("Nanny failed to start process", exc_info=True)
            self.process.terminate()
            self.status = Status.failed
            return self.status
        try:
            msg = await self._wait_until_connected(uid)
        except Exception:
            self.status = Status.failed
            self.process.terminate()
            raise
        if not msg:
            # Start was aborted (status changed away from `starting`).
            return self.status
        self.worker_address = msg["address"]
        self.worker_dir = msg["dir"]
        assert self.worker_address
        self.status = Status.running
        self.running.set()

        init_q.close()

        return self.status

    def _on_exit(self, proc):
        """Exit callback from AsyncProcess; runs in the parent."""
        if proc is not self.process:
            # Ignore exit of old process instance
            return
        self.mark_stopped()

    def _death_message(self, pid, exitcode):
        """Return a human-readable description of a nonzero child exit."""
        assert exitcode is not None
        if exitcode == 255:
            # see https://bugs.python.org/issue30589 (forkserver quirk)
            return "Worker process %d was killed by unknown signal" % (pid, )
        elif exitcode >= 0:
            return "Worker process %d exited with status %d" % (pid, exitcode)
        else:
            return "Worker process %d was killed by signal %d" % (pid, -exitcode)

    def is_alive(self):
        """True while the child process exists and has not exited."""
        return self.process is not None and self.process.is_alive()

    @property
    def pid(self):
        # pid of the live child, or None once it has exited/been reaped
        return self.process.pid if self.process and self.process.is_alive() else None

    def mark_stopped(self):
        """Record the child's death, release resources, and fire on_exit."""
        if self.status != Status.stopped:
            r = self.process.exitcode
            assert r is not None
            if r != 0:
                msg = self._death_message(self.process.pid, r)
                logger.info(msg)
            self.status = Status.stopped
            self.stopped.set()
            # Release resources
            self.process.close()
            self.init_result_q = None
            self.child_stop_q = None
            self.process = None
            # Best effort to clean up worker directory
            if self.worker_dir and os.path.exists(self.worker_dir):
                shutil.rmtree(self.worker_dir, ignore_errors=True)
            self.worker_dir = None
            # User hook
            if self.on_exit is not None:
                self.on_exit(r)

    async def kill(self, timeout: float = 2, executor_wait: bool = True):
        """
        Ensure the worker process is stopped, waiting at most
        *timeout* seconds before terminating it abruptly.
        """
        deadline = time() + timeout

        if self.status == Status.stopped:
            return
        if self.status == Status.stopping:
            await self.stopped.wait()
            return
        assert self.status in (Status.starting, Status.running)
        self.status = Status.stopping

        process = self.process
        # Ask the child to shut itself down cleanly; leave 20% of the
        # budget for the hard-kill fallback below.
        self.child_stop_q.put({
            "op": "stop",
            "timeout": max(0, deadline - time()) * 0.8,
            "executor_wait": executor_wait,
        })
        await asyncio.sleep(0)  # otherwise we get broken pipe errors
        self.child_stop_q.close()

        while process.is_alive() and time() < deadline:
            await asyncio.sleep(0.05)

        if process.is_alive():
            logger.warning(
                f"Worker process still alive after {timeout} seconds, killing")
            try:
                await process.terminate()
            except Exception as e:
                logger.error("Failed to kill worker process: %s", e)

    async def _wait_until_connected(self, uid):
        """Poll the child's init queue until it reports success or failure.

        Returns the child's init message, or None if the start was
        aborted; re-raises the child's startup exception if it sent one.
        """
        while True:
            if self.status != Status.starting:
                return
            # This is a multiprocessing queue and we'd block the event loop if
            # we simply called get
            try:
                msg = self.init_result_q.get_nowait()
            except Empty:
                await asyncio.sleep(self._init_msg_interval)
                continue

            if msg["uid"] != uid:  # ensure that we didn't cross queues
                continue

            if "exception" in msg:
                logger.error("Failed while trying to start worker process: %s",
                             msg["exception"])
                raise msg["exception"]
            else:
                return msg

    @classmethod
    def _run(
        cls,
        worker_kwargs,
        worker_start_args,
        silence_logs,
        init_result_q,
        child_stop_q,
        uid,
        env,
        config,
        Worker,
    ):  # pragma: no cover
        """Entry point executed in the child process.

        Builds the Worker on a fresh IOLoop, reports startup success or
        failure back through *init_result_q*, and runs until the worker
        finishes or a stop message arrives on *child_stop_q*.
        """
        try:
            os.environ.update(env)
            dask.config.set(config)
            try:
                from dask.multiprocessing import initialize_worker_process
            except ImportError:  # old Dask version
                pass
            else:
                initialize_worker_process()

            if silence_logs:
                logger.setLevel(silence_logs)

            # The child needs its own event loop, independent of the parent's.
            IOLoop.clear_instance()
            loop = IOLoop()
            loop.make_current()
            worker = Worker(**worker_kwargs)

            async def do_stop(timeout=5, executor_wait=True):
                # Close the worker cleanly, then stop the loop no matter what.
                try:
                    await worker.close(
                        report=True,
                        nanny=False,
                        safe=True,  # TODO: Graceful or not?
                        executor_wait=executor_wait,
                        timeout=timeout,
                    )
                finally:
                    loop.stop()

            def watch_stop_q():
                """
                Wait for an incoming stop message and then stop the
                worker cleanly.
                """
                # Runs in a dedicated thread: child_stop_q.get() blocks.
                msg = child_stop_q.get()
                child_stop_q.close()
                assert msg.pop("op") == "stop"
                loop.add_callback(do_stop, **msg)

            t = threading.Thread(target=watch_stop_q,
                                 name="Nanny stop queue watch")
            t.daemon = True
            t.start()

            async def run():
                """
                Try to start worker and inform parent of outcome.
                """
                try:
                    await worker
                except Exception as e:
                    logger.exception("Failed to start worker")
                    init_result_q.put({"uid": uid, "exception": e})
                    init_result_q.close()
                    # If we hit an exception here we need to wait for a least
                    # one interval for the outside to pick up this message.
                    # Otherwise we arrive in a race condition where the process
                    # cleanup wipes the queue before the exception can be
                    # properly handled. See also
                    # WorkerProcess._wait_until_connected (the 2 is for good
                    # measure)
                    sync_sleep(cls._init_msg_interval * 2)
                else:
                    try:
                        assert worker.address
                    except ValueError:
                        # Worker never got an address; nothing to report.
                        pass
                    else:
                        init_result_q.put({
                            "address": worker.address,
                            "dir": worker.local_directory,
                            "uid": uid,
                        })
                        init_result_q.close()
                        await worker.finished()
                        logger.info("Worker closed")
        except Exception as e:
            logger.exception("Failed to initialize Worker")
            init_result_q.put({"uid": uid, "exception": e})
            init_result_q.close()
            # If we hit an exception here we need to wait for a least one
            # interval for the outside to pick up this message. Otherwise we
            # arrive in a race condition where the process cleanup wipes the
            # queue before the exception can be properly handled. See also
            # WorkerProcess._wait_until_connected (the 2 is for good measure)
            sync_sleep(cls._init_msg_interval * 2)
        else:
            try:
                loop.run_sync(run)
            except (TimeoutError, gen.TimeoutError):
                # Loop was stopped before wait_until_closed() returned, ignore
                pass
            except KeyboardInterrupt:
                # At this point the loop is not running thus we have to run
                # do_stop() explicitly.
                loop.run_sync(do_stop)
def test_simple():
    """End-to-end AsyncProcess lifecycle: attribute checks before start,
    start/join timing, exit code on clean child exit, and garbage
    collection of both the wrapper and the underlying process object."""
    to_child = mp_context.Queue()
    from_child = mp_context.Queue()

    proc = AsyncProcess(target=feed, args=(to_child, from_child))
    # Before start(): no pid, no exit code, not a daemon by default.
    assert not proc.is_alive()
    assert proc.pid is None
    assert proc.exitcode is None
    assert not proc.daemon
    proc.daemon = True
    assert proc.daemon

    wr1 = weakref.ref(proc)
    wr2 = weakref.ref(proc._process)

    # join() before start()
    with pytest.raises(AssertionError):
        yield proc.join()

    yield proc.start()
    assert proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode is None

    # A short join() times out promptly while the child is still alive.
    t1 = time()
    yield proc.join(timeout=0.02)
    dt = time() - t1
    assert 0.2 >= dt >= 0.01
    assert proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode is None

    # setting daemon attribute after start()
    with pytest.raises(AssertionError):
        proc.daemon = False

    to_child.put(5)
    assert from_child.get() == 5

    # child should be stopping now
    t1 = time()
    yield proc.join(timeout=10)
    dt = time() - t1
    assert dt <= 1.0
    assert not proc.is_alive()
    assert proc.pid is not None
    assert proc.exitcode == 0

    # join() again
    t1 = time()
    yield proc.join()
    dt = time() - t1
    assert dt <= 0.6

    del proc
    gc.collect()
    start = time()
    while wr1() is not None and time() < start + 1:
        # Perhaps the GIL switched before _watch_process() exit,
        # help it a little
        sleep(0.001)
        gc.collect()
    if wr1() is not None:
        # Help diagnosing
        from types import FrameType
        p = wr1()
        if p is not None:
            rc = sys.getrefcount(p)
            refs = gc.get_referrers(p)
            del p
            print("refs to proc:", rc, refs)
            frames = [r for r in refs if isinstance(r, FrameType)]
            for i, f in enumerate(frames):
                print("frames #%d:" % i, f.f_code.co_name,
                      f.f_code.co_filename, sorted(f.f_locals))
        pytest.fail("AsyncProcess should have been destroyed")

    # The wrapped multiprocessing.Process must also be collected shortly.
    t1 = time()
    while wr2() is not None:
        yield gen.sleep(0.01)
        gc.collect()
    dt = time() - t1
    assert dt < 2.0