class ActionPool:
    def __init__(
            self,
            settings=None,
            *,
            processes=None,
            max_active_or_pending=None,
            allow_abort_after=None,
    ):
        # Removed _settings because it is not required anymore.
        # self.settings = _pickable_settings(settings)
        self.processes = processes or N_PROCESSES
        self.max_active_or_pending = max_active_or_pending or MAX_ACTIVE_OR_PENDING
        self.allow_abort_after = allow_abort_after or ALLOW_ABORT_AFTER

        self._pending_lock = Lock()
        self._pending = {}
        self._num_active_or_pending = 0
        self._next_result_id = count()

        # Created in self.start()
        self.pool = None
        self.qIn = None
        self.qOut = None

        # Statistics
        self._counter_ok = []
        self._counter_failed = []
        self._counter_aborted = []
        self._counter_skipped = []

        self._state = ActionPoolState.INIT

    # __enter__ and __exit__ allow usage in a with-statement.
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        # self.wait(timeout=2.0, terminate_workers=True)  # moved into stop()
        self.stop()

    def _close_queues(self):
        # Calling this after pool.close() guarantees that the
        # while loops below are finite.
        #
        # If some action was never started by the pool, qIn
        # still contains data.
        try:
            while True:
                d = self.qIn.get_nowait()
                mp_logger.debug(
                    "Non-consumed data in queue 'qIn': {}".format(d))
        except Empty:
            pass
        finally:
            self.qIn.close()

        try:
            while True:
                d = self.qOut.get_nowait()
                mp_logger.debug(
                    "Non-consumed data in queue 'qOut': {}".format(d))
        except Empty:
            pass
        finally:
            self.qOut.close()

    def start(self):
        # set_start_method("spawn")  # Too late here...
        if self._state not in [ActionPoolState.INIT, ActionPoolState.STOPED]:
            logger.error("Pool cannot be started twice.")
            return

        self.qIn = Queue()    # Sends args to workers
        self.qOut = Queue()   # Gets results from workers

        # https://stackoverflow.com/a/35134329
        # ... but:
        # 'This solution is not portable as it works only on Unix.
        # Moreover, it would not work if the user sets the maxtasksperchild
        # Pool parameter. The newly created processes would inherit
        # the standard SIGINT handler again. The pebble library disables
        # SIGINT by default for the user as soon as the new process is
        # created.'
        if True:
            original_sigint_handler = signal.signal(signal.SIGINT,
                                                    signal.SIG_IGN)
            self.pool = Pool(
                processes=self.processes,
                initializer=_ap_handler_init,
                initargs=[self.qIn, self.qOut],
                # initargs=[self.qIn, self.qOut, self.settings],
                #
                # maxtasksperchild=1 is important to re-fill the pool
                # after terminating some child processes!
                # Maybe not needed in the 'spawn' context.
                maxtasksperchild=1)
            signal.signal(signal.SIGINT, original_sigint_handler)
        else:
            # Hm, this hangs sometimes :-(
            self.pool = get_context("spawn").Pool(
                processes=self.processes,
                initializer=_ap_handler_init,
                initargs=[self.qIn, self.qOut],
                # initargs=[self.qIn, self.qOut, self.settings],
                #
                # maxtasksperchild=1 is important to re-fill the pool
                # after terminating some child processes!
                # Maybe not needed in the 'spawn' context.
                maxtasksperchild=1)

        self._state = ActionPoolState.STARTED
        logger.info("ActionPool started")

    def stop(self):
        if self._state not in [ActionPoolState.STARTED]:
            logger.error("Pool is not started.")
            return

        self.pool.close()
        self.fetch_started_ids()  # Clears qOut
        self._state = ActionPoolState.STOPED

        # Without this, child processes of the workers would survive
        # the destruction of this object.
        self.wait(timeout=1.0, terminate_workers=True)

        # Docs: 'joining a process that uses queues needs to read all
        # data before .join() is called. Otherwise .join() will block.'
        #
        # NOTE: Reading all data is a necessary but not a sufficient
        # condition. Thus we still need pool.terminate() before
        # pool.join()!
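        #
        # Background (multiprocessing docs): a process that has put items
        # on a queue waits for its feeder thread to flush the buffered
        # data to the underlying pipe before it terminates, so joining it
        # can block for as long as unread data remains in the queue.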
        # The call of _close_queues() is only there to generate
        # log messages for debugging.
        self._close_queues()

        # This terminate resolves a hanging pool.join().
        self.pool.terminate()
        self.pool.join()
        logger.info("ActionPool stopped")

    def push_action(self, f, args=()):
        # self.fetch_started_ids()

        # Generate an id for this action.
        rid = next(self._next_result_id)

        # NOTE: _num_active_or_pending <= len(self.pool._cache); they are
        # not equal if we killed a process and did not handle its result
        # / remove it from _cache by hand.

        # Kill old open jobs if no space for more is left.
        if self._num_active_or_pending >= self.max_active_or_pending:
            logger.debug("Start cleanup for {} pending "
                         "actions".format(self._num_active_or_pending))
            self.kill_stale_actions()

        # Re-check if space is available.
        if self._num_active_or_pending >= self.max_active_or_pending:
            logger.debug("Cannot start new action. Still {} "
                         "pending".format(self._num_active_or_pending))
            self._counter_skipped.append(rid)
            return False

        # Put the input arguments of the action into the queue.
        logger.debug("Put rid={} in qIn".format(rid))
        self.qIn.put_nowait((rid, f, args))

        def action_succeeded(t):
            mp_logger.debug("Action handler finished.")
            exitcode, rid, err, ret = t

            # The following would raise 'RuntimeError: dictionary changed
            # size ...' if not locked, because this function is called
            # from another thread.
            self._pending_lock.acquire()
            self._pending.pop(rid, None)
            self._num_active_or_pending -= 1
            self._pending_lock.release()

            if exitcode == 0:
                self._counter_ok.append(rid)
            else:
                self._counter_failed.append(rid)
                mp_logger.error("Action handler error: {}".format(err))

            # Take (at least) this entry from qOut.
            self.fetch_started_ids()

        def action_failed(t):
            # Due to the try/except construction in _ap_handler this is
            # not reached if the action (=f) failed.
            # Nevertheless, it will be called if the process is killed
            # by c.terminate().
            mp_logger.debug("_ap_handler failed. Reason: {}".format(t))
            return

        # Add the action to the pool (as argument for '_ap_handler').
        #
        # NOTE: We cannot pass 'rid' as an argument to _ap_handler
        # because the reading order of the queue can be scrambled.
        # Thus we need to read this value from qIn.
        result = self.pool.apply_async(_ap_handler, args=(),
                                       callback=action_succeeded,
                                       error_callback=action_failed)

        self._pending_lock.acquire()
        self._pending[rid] = Pending(None, result, None)
        self._num_active_or_pending += 1
        self._pending_lock.release()
        # Values for the None fields will be put into the queue when the
        # action process starts. Currently, they are unknown.
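        # Note: 'Pending' is assumed to be a namedtuple with the fields
        # (pid, result, time); pid and time stay None until the worker
        # reports them via qOut (see fetch_started_ids()).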
        return True

    def running_time_exceeded(self, start_time):
        return (time() - start_time > self.allow_abort_after)

    def fetch_started_ids(self):
        # Check which processes have already started and written their
        # (rid, pid, start time) into qOut.
        while True:
            try:
                [rid, pid, start_time] = self.qOut.get_nowait()
                # [rid, pid, start_time] = self.qOut.get(block=True,
                #                                        timeout=0.1)
            except Empty:
                break

            pend = self._pending.get(rid)
            if pend:
                self._pending_lock.acquire()
                self._pending[rid] = pend._replace(pid=pid, time=start_time)
                self._pending_lock.release()
            else:
                # Do not update the entry because this action already
                # finished and action_succeeded() has removed the entry
                # from _pending.
                pass

    def kill_stale_actions(self, number_to_kill=1):
        self.fetch_started_ids()  # Check for new timestamps
        if number_to_kill <= 0:
            return

        to_remove = []
        self._pending_lock.acquire()
        _pending_copy = self._pending.copy()
        self._pending_lock.release()

        for rid, pend in _pending_copy.items():
            if pend.time is None:
                # This action has not started yet => no start time
                # available.
                continue

            if self.running_time_exceeded(pend.time):
                # Find the process for this pid.
                for c in self.pool._pool:
                    # Note that only N_PROCESSES different values for
                    # c.pid are possible.
                    # We assume here that only one entry in _pending
                    # matches because earlier processes were already
                    # removed from this dict.
                    if c.pid == pend.pid:
                        to_remove.append((rid, c))
                        number_to_kill -= 1
                        break

            if number_to_kill <= 0:
                break

        for (rid, c) in to_remove:
            # Terminate children created by subprocess.Popen.
            kill_children(c.pid, False)

            # Now terminate the worker process itself.
            logger.debug("Send SIGTERM to {}".format(c.pid))
            if c.exitcode is None:
                c.terminate()
            if c.is_alive():
                # join() returns silently on timeout; exitcode stays
                # None in that case.
                c.join(timeout=1.0)
                if c.exitcode is None:
                    logger.debug("Joining failed")
            if c.exitcode is None:
                logger.debug("Send SIGKILL to {}".format(c.pid))
                c.kill()

            self._pending_lock.acquire()
            pend = self._pending.pop(rid, None)
            if pend:
                self._num_active_or_pending -= 1
                self._counter_aborted.append(rid)
                # Remove the result from pool._cache.
                try:
                    # Hm, wrong thread?! Sometimes it is already removed
                    # from _cache.
                    pend.result._set(0, (False, TimeoutError("Stale action")))
                except KeyError:
                    pass
            self._pending_lock.release()

        if to_remove:
            # Hm, _repopulate_pool() does not hold len(_pool) constant.
            self.pool._repopulate_pool()

    def wait_debug(self):
        n = 0
        while len(self.pool._cache) > 0:
            sleep(1.0)
            print(".", self.pool._cache.keys(),
                  "|", self._num_active_or_pending,
                  "P", len(self.pool._pool), end="\n")
            self.kill_stale_actions(self._num_active_or_pending)
            n += 1
            if (n % 5 == 0):
                print("")

    def wait(self, timeout=None, terminate_workers=False):
        """Give each pending action its guaranteed running time, but
        kill actions that are not fast enough.
        (Thus duration(self.wait()) <= duration(self.pool.join()).)

        Note/TODO: A high ALLOW_ABORT_AFTER can lead to very long
        blocking.
        """
        if timeout is not None:
            end_time = time() + timeout

        abort_loop = 1000  # Just as a fallback
        while self._num_active_or_pending > 0 and abort_loop > 0:
            abort_loop -= 1

            # Wait on the first pending process.
            # (The return values of the other processes are ignored
            # here.)
            result = next(iter(self.pool._cache.values()))

            wait_time = self.allow_abort_after
            if timeout is not None:
                wait_time = min(wait_time, end_time - time())
                if wait_time <= 0:
                    logger.debug("ActionPool.wait() reached timeout")
                    break

            try:
                result.get(wait_time)
            except TimeoutError:
                pass

            self.kill_stale_actions(self._num_active_or_pending)

        if timeout is not None and terminate_workers:
            self.kill_workers()

        if abort_loop == 0:
            raise RuntimeError("Some processes still running?!")

    def kill_workers(self):
        for c in self.pool._pool:
            # Terminate children created by subprocess.Popen.
            kill_children(c.pid, False)
            if c.exitcode is None:
                c.terminate()
                # join() returns silently on timeout; exitcode stays
                # None in that case.
                c.join(timeout=1.0)
            if c.exitcode is None:
                c.kill()

    def statistic(self):
        # Due to the usage of multiprocessing.queues.SimpleQueue and
        # other problems it is hard to count the running tasks. Just
        # assume that all self.processes are used all the time.
        num_active = min(len(self.pool._cache), self.processes)
        return (" Tasks ok:         {}\n"
                " Tasks skipped:    {}\n"
                " Tasks aborted:    {}\n"
                " Tasks failed:     {}\n"
                "#Tasks active:     {}\n"
                "#Tasks not begun:  {}".format(
                    self._counter_ok,
                    self._counter_skipped,
                    self._counter_aborted,
                    self._counter_failed,
                    num_active,
                    self._num_active_or_pending - num_active,
                ))
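
# ---------------------------------------------------------------------
# Minimal usage sketch (illustrative, not a definitive example). It
# assumes the module-level names used above (Pool, Queue, N_PROCESSES,
# _ap_handler, sleep, ...) are defined at the top of this file and that
# the default 'fork' start method is in use; 'example_action' is a
# hypothetical action.
if __name__ == "__main__":
    def example_action(duration):
        # Simulate work; anything running longer than allow_abort_after
        # becomes a stale action and may be aborted by
        # kill_stale_actions()/wait().
        sleep(duration)
        return duration

    with ActionPool(processes=2, allow_abort_after=10.0) as ap:
        for d in range(4):
            ap.push_action(example_action, args=(d,))
        # Block until all pushed actions finished or were aborted.
        ap.wait(timeout=30.0)
        print(ap.statistic())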