def check_worker_lost(task, analysis_pk):
    """
    SAFE GUARD: - Fail any tasks received from dead workers
    -------------------------------------------------------
    Setting the option `acks_late` means tasks will remain on the Queue
    until after a task has completed. If the worker goes down during the
    execution of `generate_input` or `start_analysis_task`, then if
    another worker is available the task will be picked up by an active
    worker.

    When the task is picked up for a 2nd time, the new worker will
    reject it with `WorkerLostError` and mark the execution as failed.

    Note that this is not the ideal approach, since at least one alive
    worker is required to fail a crashed worker's task. A better method
    is to use either task signals or celery events to fail the task
    immediately, so this should be viewed as a fallback option.
    """
    current_state = task.AsyncResult(task.request.id).state
    logging.info(current_state)
    if current_state == RUNNING_TASK_STATUS:
        raise WorkerLostError(
            'Task received from dead worker - A worker container crashed '
            'when executing a task from analysis_id={}'.format(analysis_pk))
    task.update_state(state=RUNNING_TASK_STATUS, meta={'analysis_pk': analysis_pk})
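# A minimal sketch of how the guard above might be wired into a bound
# Celery task. The app name, the run_analysis task, and the
# RUNNING_TASK_STATUS value are assumptions for illustration; only the
# acks_late and bind options are real Celery features.
import logging

from celery import Celery

RUNNING_TASK_STATUS = 'RUNNING'  # assumed custom task state

app = Celery('analysis')  # hypothetical app
app.conf.task_acks_late = True  # leave messages unacked until completion


@app.task(bind=True)
def run_analysis(self, analysis_pk):  # hypothetical task
    # On redelivery after a worker crash the state is still
    # RUNNING_TASK_STATUS, so check_worker_lost raises WorkerLostError
    # and the execution is marked as failed.
    check_worker_lost(self, analysis_pk)
    logging.info('starting analysis %s', analysis_pk)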
def _apply_target(target, args=(), kwargs={}, callback=None,
                  accept_callback=None, pid=None, getpid=os.getpid,
                  propagate=(), monotonic=monotonic, **_):
    if accept_callback:
        accept_callback(pid or getpid(), monotonic())
    try:
        ret = target(*args, **kwargs)
        if isawaitable(ret):
            ret = yield ret
    except propagate:
        raise
    except Exception:
        raise
    except (WorkerShutdown, WorkerTerminate):
        raise
    except BaseException as exc:
        try:
            reraise(WorkerLostError, WorkerLostError(repr(exc)),
                    sys.exc_info()[2])
        except WorkerLostError:
            callback(ExceptionInfo())
    else:
        callback(ret)
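# An isolated illustration of the fallback branch in _apply_target: a
# BaseException that is not a plain Exception (e.g. SystemExit raised
# inside the target) is re-raised as WorkerLostError with the original
# traceback attached, then delivered to the failure callback.
# fake_target is a hypothetical stand-in for a real task function.
import sys

from billiard.exceptions import WorkerLostError


def fake_target():
    raise SystemExit(1)  # simulates the worker dying mid-task


try:
    fake_target()
except Exception:
    raise  # ordinary errors keep their own type
except BaseException as exc:
    try:
        # equivalent of reraise(WorkerLostError, ..., tb) in the source
        raise WorkerLostError(repr(exc)).with_traceback(sys.exc_info()[2])
    except WorkerLostError as wle:
        print('callback would receive:', repr(wle))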
def asynloop(obj, connection, consumer, blueprint, hub, qos,
             heartbeat, clock, hbrate=2.0):
    """Non-blocking event loop."""
    RUN = bootsteps.RUN
    update_qos = qos.update
    errors = connection.connection_errors

    on_task_received = obj.create_task_handler()

    _enable_amqheartbeats(hub.timer, connection, rate=hbrate)

    consumer.on_message = on_task_received
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)
    consumer.consume()
    obj.on_ready()

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # consumer.consume() may have prefetched up to our
    # limit - drain an event so we're in a clean state
    # prior to starting our event loop.
    if connection.transport.driver_type == 'amqp':
        hub.call_soon(_quick_drain, connection)

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            state.maybe_shutdown()

            # We only update QoS when there's no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.reset()
        except Exception as exc:  # pylint: disable=broad-except
            logger.exception('Error cleaning up after event loop: %r', exc)
def _join_exited_workers(self):
    """Cleanup after any worker processes which have exited due to
    reaching their specified lifetime. Returns True if any workers were
    cleaned up.
    """
    cleaned = []
    for i in reversed(range(len(self._pool))):
        worker = self._pool[i]
        if worker.exitcode is not None:
            # worker exited
            debug('cleaning up worker %d' % i)
            if self._putlock is not None:
                try:
                    self._putlock.release()
                except ValueError:
                    pass
            worker.join()
            cleaned.append(worker.pid)
            del self._pool[i]
    if cleaned:
        for job in self._cache.values():
            for worker_pid in job.worker_pids():
                if worker_pid in cleaned:
                    err = WorkerLostError("Worker exited prematurely.")
                    job._set(None, (False, err))
                    continue
        return True
    return False
def test_on_failure__WorkerLostError(self):
    exc = WorkerLostError()
    job = self._test_on_failure(exc)
    job.task.backend.mark_as_failure.assert_called_with(
        job.id, exc, request=job._context, store_result=True,
    )
def test_on_failure_WorkerLostError_redelivered_None(self):
    einfo = None
    try:
        raise WorkerLostError()
    except WorkerLostError:
        einfo = ExceptionInfo(internal=True)

    req = self.get_request(self.add.s(2, 2))
    req.task.acks_late = True
    req.task.reject_on_worker_lost = True
    req.delivery_info['redelivered'] = None
    req.on_failure(einfo)

    req.on_reject.assert_called_with(
        req_logger, req.connection_errors, True)
def _join_exited_workers(self, shutdown=False):
    """Cleanup after any worker processes which have exited due to
    reaching their specified lifetime. Returns True if any workers were
    cleaned up.
    """
    now = None
    # The worker may have published a result before being terminated,
    # but we have no way to accurately tell if it did. So we wait for
    # _lost_worker_timeout seconds before we mark the job with
    # WorkerLostError.
    for job in [job for job in self._cache.values()
                if not job.ready() and job._worker_lost]:
        now = now or time.time()
        if now - job._worker_lost > job._lost_worker_timeout:
            exc_info = None
            try:
                raise WorkerLostError("Worker exited prematurely.")
            except WorkerLostError:
                exc_info = ExceptionInfo(sys.exc_info())
            job._set(None, (False, exc_info))

    if shutdown and not len(self._pool):
        raise WorkersJoined()

    cleaned = []
    for i in reversed(range(len(self._pool))):
        worker = self._pool[i]
        if worker.exitcode is not None:
            # worker exited
            debug('Supervisor: cleaning up worker %d' % i)
            worker.join()
            debug('Supervisor: worker %d joined' % i)
            cleaned.append(worker.pid)
            del self._pool[i]
            del self._poolctrl[worker.pid]
    if cleaned:
        for job in self._cache.values():
            for worker_pid in job.worker_pids():
                if worker_pid in cleaned and not job.ready():
                    job._worker_lost = time.time()
                    continue
        if self._putlock is not None:
            for worker in cleaned:
                self._putlock.release()
        return True
    return False
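# A toy sketch of the grace period applied above: a job whose worker
# disappeared is only failed with WorkerLostError once
# _lost_worker_timeout seconds have passed, in case the worker managed
# to publish its result just before dying. This Job class is a
# simplified stand-in, not billiard's real result type.
import time

from billiard.exceptions import WorkerLostError


class Job:
    def __init__(self, lost_worker_timeout=10.0):
        self._worker_lost = None  # timestamp of when the worker vanished
        self._lost_worker_timeout = lost_worker_timeout
        self.result = None

    def mark_worker_lost(self):
        self._worker_lost = self._worker_lost or time.time()

    def maybe_fail(self, now=None):
        now = now or time.time()
        if (self._worker_lost is not None
                and now - self._worker_lost > self._lost_worker_timeout):
            self.result = (False, WorkerLostError('Worker exited prematurely.'))


job = Job(lost_worker_timeout=0.1)
job.mark_worker_lost()
time.sleep(0.2)
job.maybe_fail()
print(job.result)  # (False, WorkerLostError('Worker exited prematurely.'))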
def test_on_failure_WorkerLostError(self):
    tw = TaskRequest(mytask.name, gen_unique_id(), [1], {"f": "x"})
    try:
        raise WorkerLostError("do re mi")
    except WorkerLostError:
        exc_info = ExceptionInfo(sys.exc_info())
    tw.on_failure(exc_info)
    self.assertEqual(mytask.backend.get_status(tw.task_id),
                     states.FAILURE)

    mytask.ignore_result = True
    try:
        tw = TaskRequest(mytask.name, gen_unique_id(), [1], {"f": "x"})
        tw.on_failure(exc_info)
        self.assertEqual(mytask.backend.get_status(tw.task_id),
                         states.PENDING)
    finally:
        mytask.ignore_result = False
def test_on_failure_WorkerLostError_rejects_with_requeue(self):
    try:
        raise WorkerLostError()
    except WorkerLostError:
        einfo = ExceptionInfo(internal=True)

    req = self.get_request(self.add.s(2, 2))
    req.task.acks_late = True
    req.task.reject_on_worker_lost = True
    req.delivery_info['redelivered'] = False
    req.task.backend = Mock()

    req.on_failure(einfo)

    req.on_reject.assert_called_with(
        req_logger, req.connection_errors, True)
    req.task.backend.mark_as_failure.assert_not_called()
def test_on_failure_acks_late_reject_on_worker_lost_enabled(self):
    try:
        raise WorkerLostError()
    except WorkerLostError:
        exc_info = ExceptionInfo()
    self.mytask.acks_late = True
    self.mytask.reject_on_worker_lost = True

    job = self.xRequest()
    job.delivery_info['redelivered'] = False
    job.on_failure(exc_info)

    assert self.mytask.backend.get_status(job.id) == states.PENDING

    job = self.xRequest()
    job.delivery_info['redelivered'] = True
    job.on_failure(exc_info)

    assert self.mytask.backend.get_status(job.id) == states.PENDING
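# The behaviour asserted by the tests above is what users opt into with
# Celery's real acks_late and reject_on_worker_lost task options: on a
# worker crash the unacked message is rejected with requeue and the
# task is not marked FAILURE. The app and task names are assumptions.
from celery import Celery

app = Celery('example')


@app.task(acks_late=True, reject_on_worker_lost=True)
def resilient_add(x, y):
    # If the worker dies mid-run, the broker redelivers the message to
    # another worker instead of the result backend recording a failure.
    return x + y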
def _join_exited_workers(self, lost_worker_timeout=10.0):
    """Cleanup after any worker processes which have exited due to
    reaching their specified lifetime. Returns True if any workers were
    cleaned up.
    """
    now = None
    # The worker may have published a result before being terminated,
    # but we have no way to accurately tell if it did. So we wait for
    # 10 seconds before we mark the job with WorkerLostError.
    for job in [job for job in self._cache.values()
                if not job.ready() and job._worker_lost]:
        now = now or time.time()
        if now - job._worker_lost > lost_worker_timeout:
            err = WorkerLostError("Worker exited prematurely.")
            job._set(None, (False, err))

    cleaned = []
    for i in reversed(range(len(self._pool))):
        worker = self._pool[i]
        if worker.exitcode is not None:
            # worker exited
            debug('cleaning up worker %d' % i)
            worker.join()
            cleaned.append(worker.pid)
            del self._pool[i]
    if cleaned:
        for job in self._cache.values():
            for worker_pid in job.worker_pids():
                if worker_pid in cleaned and not job.ready():
                    if self._putlock is not None:
                        self._putlock.release()
                    job._worker_lost = time.time()
                    continue
        return True
    return False
def test_on_failure_WorkerLostError_redelivered_True(self):
    try:
        raise WorkerLostError()
    except WorkerLostError:
        einfo = ExceptionInfo(internal=True)

    req = self.get_request(self.add.s(2, 2))
    req.task.acks_late = False
    req.task.reject_on_worker_lost = True
    req.delivery_info['redelivered'] = True
    req.task.backend = Mock()

    with self.assert_signal_called(
            task_failure,
            sender=req.task,
            task_id=req.id,
            exception=einfo.exception,
            args=req.args,
            kwargs=req.kwargs,
            traceback=einfo.traceback,
            einfo=einfo):
        req.on_failure(einfo)

    req.task.backend.mark_as_failure.assert_called_once_with(
        req.id, einfo.exception, request=req._context, store_result=True)
def asynloop(obj, connection, consumer, blueprint, hub, qos,
             heartbeat, clock, hbrate=2.0, RUN=RUN):
    """Non-blocking event loop consuming messages until connection is lost,
    or shutdown is requested."""
    update_qos = qos.update
    hbtick = connection.heartbeat_check
    errors = connection.connection_errors
    heartbeat = connection.get_heartbeat_interval()  # negotiated

    on_task_received = obj.create_task_handler()

    if heartbeat and connection.supports_heartbeats:
        hub.call_repeatedly(heartbeat / hbrate, hbtick, hbrate)

    consumer.on_message = on_task_received
    consumer.consume()
    obj.on_ready()
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # consumer.consume() may have prefetched up to our
    # limit - drain an event so we are in a clean state
    # prior to starting our event loop.
    if connection.transport.driver_type == 'amqp':
        hub.call_soon(_quick_drain, connection)

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            # shutdown if signal handlers told us to.
            should_stop, should_terminate = (
                state.should_stop, state.should_terminate,
            )
            # False == EX_OK, so must use is not False
            if should_stop is not None and should_stop is not False:
                raise WorkerShutdown(should_stop)
            elif should_terminate is not None and should_terminate is not False:
                raise WorkerTerminate(should_terminate)

            # We only update QoS when there are no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.reset()
        except Exception as exc:
            error(
                'Error cleaning up after event loop: %r', exc,
                exc_info=1,
            )
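# Why the loop above compares with `is not False`: a requested exit
# code of 0 (EX_OK) is falsy and compares equal to False, so a plain
# truthiness or equality test would silently skip the shutdown. The
# identity check only skips the literal False sentinel:
should_stop = 0  # EX_OK requested by a signal handler
if should_stop is not None and should_stop is not False:
    print('raises WorkerShutdown(0)')  # taken: 0 is not the False object
if should_stop:
    print('never reached')  # a truthiness test would miss EX_OK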
def get_ei():
    try:
        raise WorkerLostError("do re mi")
    except WorkerLostError:
        return ExceptionInfo(sys.exc_info())
def get_ei():
    try:
        raise WorkerLostError('do re mi')
    except WorkerLostError:
        return ExceptionInfo()
def asynloop(obj, connection, consumer, blueprint, hub, qos,
             heartbeat, clock, hbrate=2.0, RUN=RUN):
    """Non-blocking event loop consuming messages until connection is lost,
    or shutdown is requested."""
    update_qos = qos.update
    readers, writers = hub.readers, hub.writers
    hbtick = connection.heartbeat_check
    errors = connection.connection_errors
    hub_add, hub_remove = hub.add, hub.remove

    on_task_received = obj.create_task_handler()

    if heartbeat and connection.supports_heartbeats:
        hub.call_repeatedly(heartbeat / hbrate, hbtick, hbrate)

    consumer.callbacks = [on_task_received]
    consumer.consume()
    obj.on_ready()
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            # shutdown if signal handlers told us to.
            if state.should_stop:
                raise SystemExit()
            elif state.should_terminate:
                raise SystemTerminate()

            # We only update QoS when there are no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.close()
        except Exception as exc:
            error(
                'Error cleaning up after event loop: %r', exc,
                exc_info=1,
            )