def test_reap_process_group(self): """ Spin up a process that can't be killed by SIGTERM and make sure it gets killed anyway. """ parent_setup_done = multiprocessing.Semaphore(0) parent_pid = multiprocessing.Value('i', 0) child_pid = multiprocessing.Value('i', 0) args = [parent_pid, child_pid, parent_setup_done] parent = multiprocessing.Process(target=TestReapProcessGroup._parent_of_ignores_sigterm, args=args) try: parent.start() self.assertTrue(parent_setup_done.acquire(timeout=5.0)) self.assertTrue(psutil.pid_exists(parent_pid.value)) self.assertTrue(psutil.pid_exists(child_pid.value)) process_utils.reap_process_group(parent_pid.value, logging.getLogger(), timeout=1) self.assertFalse(psutil.pid_exists(parent_pid.value)) self.assertFalse(psutil.pid_exists(child_pid.value)) finally: try: os.kill(parent_pid.value, signal.SIGKILL) # terminate doesnt work here os.kill(child_pid.value, signal.SIGKILL) # terminate doesnt work here except OSError: pass
def _heartbeat_manager(self): """Heartbeat DAG file processor and restart it if we are not done.""" if not self._parent_signal_conn: raise ValueError("Process not started.") if self._process and not self._process.is_alive(): self._process.join(timeout=0) if not self.done: self.log.warning( "DagFileProcessorManager (PID=%d) exited with exit code %d - re-launching", self._process.pid, self._process.exitcode, ) self.start() if self.done: return parsing_stat_age = time.monotonic() - self._last_parsing_stat_received_at if parsing_stat_age > self._processor_timeout.total_seconds(): Stats.incr('dag_processing.manager_stalls') self.log.error( "DagFileProcessorManager (PID=%d) last sent a heartbeat %.2f seconds ago! Restarting it", self._process.pid, parsing_stat_age, ) reap_process_group(self._process.pid, logger=self.log) self.start()
def end(self): """ Terminate (and then kill) the manager process launched. :return: """ if not self._process: self.log.warning('Ending without manager process.') return reap_process_group(self._process.pid, logger=self.log) self._parent_signal_conn.close()
def end(self): """ Terminate (and then kill) the manager process launched. :return: """ if not self._process: self.log.warning('Ending without manager process.') return # Give the Manager some time to cleanly shut down, but not too long, as # it's better to finish sooner than wait for (non-critical) work to # finish self._process.join(timeout=1.0) reap_process_group(self._process.pid, logger=self.log) self._parent_signal_conn.close()
def terminate(self): if self.process is None: return if self.process.is_running(): rcs = reap_process_group(self.process.pid, self.log) self._rc = rcs.get(self.process.pid) self.process = None if self._rc is None: # Something else reaped it before we had a chance, so let's just "guess" at an error code. self._rc = -9
def terminate(self): if self.process is None: return # Reap the child process - it may already be finished _ = self.return_code(timeout=0) if self.process and self.process.is_running(): rcs = reap_process_group(self.process.pid, self.log) self._rc = rcs.get(self.process.pid) self.process = None if self._rc is None: # Something else reaped it before we had a chance, so let's just "guess" at an error code. self._rc = -9
def terminate(self): if self.process is None: return # Reap the child process - it may already be finished _ = self.return_code(timeout=0) if self.process and self.process.is_running(): rcs = reap_process_group(self.process.pid, self.log) self._rc = rcs.get(self.process.pid) self.process = None if self._rc is None: # Something else reaped it before we had a chance, so let's just "guess" at an error code. self._rc = -9 if self._rc == -9: # If either we or psutil gives out a -9 return code, it likely means # an OOM happened self.log.error( 'Job %s was killed before it finished (likely due to running out of memory)', self._task_instance.job_id, )
def terminate(self): if self.process and psutil.pid_exists(self.process.pid): reap_process_group(self.process.pid, self.log)