Example #1
def check_worker_lost(task, analysis_pk):
    """
    SAFE GUARD: - Fail any tasks received from dead workers
    -------------------------------------------------------
    Setting the option `acks_late` means tasks will remain on the queue until after
    a task has completed. If the worker goes down during the execution of `generate_input`
    or `start_analysis_task`, and another worker is available, the task will be picked up
    by an active worker.

    When the task is picked up for a second time, the new worker will reject it with
    'WorkerLostError' and mark the execution as failed.

    Note that this is not the ideal approach, since at least one live worker is required to
    fail a crashed worker's task.

    A better method is to use either task signals or celery events to fail the task immediately,
    so this should be viewed as a fallback option.
    """
    current_state = task.AsyncResult(task.request.id).state
    logging.info(current_state)
    if current_state == RUNNING_TASK_STATUS:
        raise WorkerLostError(
            'Task received from dead worker - A worker container crashed when executing a task from analysis_id={}'
            .format(analysis_pk))
    task.update_state(state=RUNNING_TASK_STATUS,
                      meta={'analysis_pk': analysis_pk})
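As a usage sketch (not part of the example source), the guard above could be called at the top of a bound, late-acknowledging task. The app object, the broker URL, and the RUNNING_TASK_STATUS value are assumptions made for illustration; only check_worker_lost comes from the example.

from celery import Celery

app = Celery('analyses', broker='amqp://localhost//')  # hypothetical app
RUNNING_TASK_STATUS = 'RUNNING'  # placeholder for the project's running-state constant

@app.task(bind=True, acks_late=True)
def start_analysis_task(self, analysis_pk):
    # Raises WorkerLostError if this message was redelivered after a worker
    # crash; otherwise records the running state and carries on.
    check_worker_lost(self, analysis_pk)
    ...  # run the analysis here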
Example #2
def _apply_target(target,
                  args=(),
                  kwargs={},
                  callback=None,
                  accept_callback=None,
                  pid=None,
                  getpid=os.getpid,
                  propagate=(),
                  monotonic=monotonic,
                  **_):
    if accept_callback:
        accept_callback(pid or getpid(), monotonic())
    try:
        ret = target(*args, **kwargs)
        if isawaitable(ret):
            ret = yield ret
    except propagate:
        raise
    except Exception:
        raise
    except (WorkerShutdown, WorkerTerminate):
        raise
    except BaseException as exc:
        try:
            reraise(WorkerLostError, WorkerLostError(repr(exc)),
                    sys.exc_info()[2])
        except WorkerLostError:
            callback(ExceptionInfo())
    else:
        callback(ret)
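For context, a stripped-down, standalone sketch of the exception cascade used above (the run_guarded name and its callback parameters are invented): anything that escapes the target and is not an ordinary Exception is reported to the error callback as a WorkerLostError that keeps the original traceback.

import sys

from celery.exceptions import WorkerLostError


def run_guarded(target, on_done, on_error):
    # Toy version of the pattern in _apply_target above.
    try:
        ret = target()
    except Exception:
        raise                      # ordinary task errors propagate as usual
    except BaseException as exc:   # e.g. SystemExit raised in a dying child
        # Convert the low-level failure into a WorkerLostError carrying the
        # traceback of the original exception.
        on_error(WorkerLostError(repr(exc)).with_traceback(sys.exc_info()[2]))
    else:
        on_done(ret)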
Example #3
def asynloop(obj,
             connection,
             consumer,
             blueprint,
             hub,
             qos,
             heartbeat,
             clock,
             hbrate=2.0):
    """Non-blocking event loop."""
    RUN = bootsteps.RUN
    update_qos = qos.update
    errors = connection.connection_errors

    on_task_received = obj.create_task_handler()

    _enable_amqheartbeats(hub.timer, connection, rate=hbrate)

    consumer.on_message = on_task_received
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)
    consumer.consume()
    obj.on_ready()

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # consumer.consume() may have prefetched up to our
    # limit - drain an event so we're in a clean state
    # prior to starting our event loop.
    if connection.transport.driver_type == 'amqp':
        hub.call_soon(_quick_drain, connection)

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            state.maybe_shutdown()

            # We only update QoS when there are no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.reset()
        except Exception as exc:  # pylint: disable=broad-except
            logger.exception('Error cleaning up after event loop: %r', exc)
Example #4
def _join_exited_workers(self):
    """Cleanup after any worker processes which have exited due to
    reaching their specified lifetime. Returns True if any workers were
    cleaned up.
    """
    cleaned = []
    for i in reversed(range(len(self._pool))):
        worker = self._pool[i]
        if worker.exitcode is not None:
            # worker exited
            debug('cleaning up worker %d' % i)
            if self._putlock is not None:
                try:
                    self._putlock.release()
                except ValueError:
                    pass
            worker.join()
            cleaned.append(worker.pid)
            del self._pool[i]
    if cleaned:
        for job in self._cache.values():
            for worker_pid in job.worker_pids():
                if worker_pid in cleaned:
                    err = WorkerLostError("Worker exited prematurely.")
                    job._set(None, (False, err))
                    continue
        return True
    return False
Example #5
def test_on_failure__WorkerLostError(self):
    exc = WorkerLostError()
    job = self._test_on_failure(exc)
    job.task.backend.mark_as_failure.assert_called_with(
        job.id,
        exc,
        request=job._context,
        store_result=True,
    )
Example #6
def test_on_failure_WorkerLostError_redelivered_None(self):
    einfo = None
    try:
        raise WorkerLostError()
    except:
        einfo = ExceptionInfo(internal=True)
    req = self.get_request(self.add.s(2, 2))
    req.task.acks_late = True
    req.task.reject_on_worker_lost = True
    req.delivery_info['redelivered'] = None
    req.on_failure(einfo)
    req.on_reject.assert_called_with(req_logger, req.connection_errors,
                                     True)
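The request tests in this and the following examples exercise the acks_late and reject_on_worker_lost task options. A minimal sketch of how a task opts in to this behavior (the app object and the task body are illustrative assumptions):

from celery import Celery

app = Celery('proj', broker='amqp://localhost//')  # hypothetical app


@app.task(acks_late=True, reject_on_worker_lost=True)
def add(x, y):
    # If the worker executing this task is killed mid-run, the message is
    # rejected back to the broker (requeued unless it was already flagged as
    # redelivered) instead of being acknowledged and marked as failed.
    return x + y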
Example #7
    def _join_exited_workers(self, shutdown=False):
        """Cleanup after any worker processes which have exited due to
        reaching their specified lifetime. Returns True if any workers were
        cleaned up.
        """
        now = None
        # The worker may have published a result before being terminated,
        # but we have no way to accurately tell if it did.  So we wait for
        # _lost_worker_timeout seconds before we mark the job with
        # WorkerLostError.
        for job in [
                job for job in self._cache.values()
                if not job.ready() and job._worker_lost
        ]:
            now = now or time.time()
            if now - job._worker_lost > job._lost_worker_timeout:
                exc_info = None
                try:
                    raise WorkerLostError("Worker exited prematurely.")
                except WorkerLostError:
                    exc_info = ExceptionInfo(sys.exc_info())
                job._set(None, (False, exc_info))

        if shutdown and not len(self._pool):
            raise WorkersJoined()

        cleaned = []
        for i in reversed(range(len(self._pool))):
            worker = self._pool[i]
            if worker.exitcode is not None:
                # worker exited
                debug('Supervisor: cleaning up worker %d' % i)
                worker.join()
                debug('Supervisor: worker %d joined' % i)
                cleaned.append(worker.pid)
                del self._pool[i]
                del self._poolctrl[worker.pid]
        if cleaned:
            for job in self._cache.values():
                for worker_pid in job.worker_pids():
                    if worker_pid in cleaned and not job.ready():
                        job._worker_lost = time.time()
                        continue
            if self._putlock is not None:
                for worker in cleaned:
                    self._putlock.release()
            return True
        return False
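The comment at the top of this method explains why a job is not failed immediately: the crashed worker may already have published its result. A small, self-contained sketch of that grace-period idea (the Job class and reap function are invented here and are not part of billiard):

import time

from celery.exceptions import WorkerLostError

LOST_WORKER_TIMEOUT = 10.0  # seconds to wait before giving up on a result


class Job:
    # Toy stand-in for a pool job entry.
    def __init__(self):
        self.worker_lost = None  # time at which the owning worker was seen dead
        self.result = None       # (ok, value_or_exception) once settled

    def ready(self):
        return self.result is not None


def reap(jobs, now=None):
    # Fail jobs whose worker vanished more than LOST_WORKER_TIMEOUT seconds ago.
    now = now or time.time()
    for job in jobs:
        if not job.ready() and job.worker_lost:
            if now - job.worker_lost > LOST_WORKER_TIMEOUT:
                job.result = (False, WorkerLostError('Worker exited prematurely.'))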
Example #8
    def test_on_failure_WorkerLostError(self):
        tw = TaskRequest(mytask.name, gen_unique_id(), [1], {"f": "x"})
        try:
            raise WorkerLostError("do re mi")
        except WorkerLostError:
            exc_info = ExceptionInfo(sys.exc_info())
        tw.on_failure(exc_info)
        self.assertEqual(mytask.backend.get_status(tw.task_id), states.FAILURE)

        mytask.ignore_result = True
        try:
            tw = TaskRequest(mytask.name, gen_unique_id(), [1], {"f": "x"})
            tw.on_failure(exc_info)
            self.assertEqual(mytask.backend.get_status(tw.task_id),
                             states.PENDING)
        finally:
            mytask.ignore_result = False
Example #9
    def test_on_failure_WorkerLostError_rejects_with_requeue(self):
        try:
            raise WorkerLostError()
        except WorkerLostError:
            einfo = ExceptionInfo(internal=True)

        req = self.get_request(self.add.s(2, 2))
        req.task.acks_late = True
        req.task.reject_on_worker_lost = True
        req.delivery_info['redelivered'] = False
        req.task.backend = Mock()

        req.on_failure(einfo)

        req.on_reject.assert_called_with(req_logger, req.connection_errors,
                                         True)
        req.task.backend.mark_as_failure.assert_not_called()
Example #10
    def test_on_failure_acks_late_reject_on_worker_lost_enabled(self):
        try:
            raise WorkerLostError()
        except WorkerLostError:
            exc_info = ExceptionInfo()
        self.mytask.acks_late = True
        self.mytask.reject_on_worker_lost = True

        job = self.xRequest()
        job.delivery_info['redelivered'] = False
        job.on_failure(exc_info)

        assert self.mytask.backend.get_status(job.id) == states.PENDING

        job = self.xRequest()
        job.delivery_info['redelivered'] = True
        job.on_failure(exc_info)

        assert self.mytask.backend.get_status(job.id) == states.PENDING
Example #11
    def _join_exited_workers(self, lost_worker_timeout=10.0):
        """Cleanup after any worker processes which have exited due to
        reaching their specified lifetime. Returns True if any workers were
        cleaned up.
        """
        now = None
        # The worker may have published a result before being terminated,
        # but we have no way to accurately tell if it did.  So we wait for
        # 10 seconds before we mark the job with WorkerLostError.
        for job in [
                job for job in self._cache.values()
                if not job.ready() and job._worker_lost
        ]:
            now = now or time.time()
            if now - job._worker_lost > lost_worker_timeout:
                err = WorkerLostError("Worker exited prematurely.")
                job._set(None, (False, err))

        cleaned = []
        for i in reversed(range(len(self._pool))):
            worker = self._pool[i]
            if worker.exitcode is not None:
                # worker exited
                debug('cleaning up worker %d' % i)
                worker.join()
                cleaned.append(worker.pid)
                del self._pool[i]
        if cleaned:
            for job in self._cache.values():
                for worker_pid in job.worker_pids():
                    if worker_pid in cleaned and not job.ready():
                        if self._putlock is not None:
                            self._putlock.release()
                        job._worker_lost = time.time()
                        continue
            return True
        return False
Example #12
    def test_on_failure_WorkerLostError_redelivered_True(self):
        try:
            raise WorkerLostError()
        except WorkerLostError:
            einfo = ExceptionInfo(internal=True)

        req = self.get_request(self.add.s(2, 2))
        req.task.acks_late = False
        req.task.reject_on_worker_lost = True
        req.delivery_info['redelivered'] = True
        req.task.backend = Mock()

        with self.assert_signal_called(task_failure,
                                       sender=req.task,
                                       task_id=req.id,
                                       exception=einfo.exception,
                                       args=req.args,
                                       kwargs=req.kwargs,
                                       traceback=einfo.traceback,
                                       einfo=einfo):
            req.on_failure(einfo)

        req.task.backend.mark_as_failure.assert_called_once_with(
            req.id, einfo.exception, request=req._context, store_result=True)
Example #13
def asynloop(obj,
             connection,
             consumer,
             blueprint,
             hub,
             qos,
             heartbeat,
             clock,
             hbrate=2.0,
             RUN=RUN):
    """Non-blocking event loop consuming messages until connection is lost,
    or shutdown is requested."""
    update_qos = qos.update
    hbtick = connection.heartbeat_check
    errors = connection.connection_errors
    heartbeat = connection.get_heartbeat_interval()  # negotiated

    on_task_received = obj.create_task_handler()

    if heartbeat and connection.supports_heartbeats:
        hub.call_repeatedly(heartbeat / hbrate, hbtick, hbrate)

    consumer.on_message = on_task_received
    consumer.consume()
    obj.on_ready()
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # consumer.consume() may have prefetched up to our
    # limit - drain an event so we are in a clean state
    # prior to starting our event loop.
    if connection.transport.driver_type == 'amqp':
        hub.call_soon(_quick_drain, connection)

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            # shutdown if signal handlers told us to.
            should_stop, should_terminate = (
                state.should_stop,
                state.should_terminate,
            )
            # False == EX_OK, so must use is not False
            if should_stop is not None and should_stop is not False:
                raise WorkerShutdown(should_stop)
            elif should_terminate is not None and should_terminate is not False:
                raise WorkerTerminate(should_terminate)

            # We only update QoS when there are no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.reset()
        except Exception as exc:
            error(
                'Error cleaning up after event loop: %r',
                exc,
                exc_info=1,
            )
Example #14
def get_ei():
    try:
        raise WorkerLostError("do re mi")
    except WorkerLostError:
        return ExceptionInfo(sys.exc_info())
Example #15
def get_ei():
    try:
        raise WorkerLostError('do re mi')
    except WorkerLostError:
        return ExceptionInfo()
Example #16
def asynloop(obj,
             connection,
             consumer,
             blueprint,
             hub,
             qos,
             heartbeat,
             clock,
             hbrate=2.0,
             RUN=RUN):
    """Non-blocking event loop consuming messages until connection is lost,
    or shutdown is requested."""

    update_qos = qos.update
    readers, writers = hub.readers, hub.writers
    hbtick = connection.heartbeat_check
    errors = connection.connection_errors
    hub_add, hub_remove = hub.add, hub.remove

    on_task_received = obj.create_task_handler()

    if heartbeat and connection.supports_heartbeats:
        hub.call_repeatedly(heartbeat / hbrate, hbtick, hbrate)

    consumer.callbacks = [on_task_received]
    consumer.consume()
    obj.on_ready()
    obj.controller.register_with_event_loop(hub)
    obj.register_with_event_loop(hub)

    # did_start_ok will verify that pool processes were able to start,
    # but this will only work the first time we start, as
    # maxtasksperchild will mess up metrics.
    if not obj.restart_count and not obj.pool.did_start_ok():
        raise WorkerLostError('Could not start worker processes')

    # FIXME: Use loop.run_forever
    # Tried and works, but no time to test properly before release.
    hub.propagate_errors = errors
    loop = hub.create_loop()

    try:
        while blueprint.state == RUN and obj.connection:
            # shutdown if signal handlers told us to.
            if state.should_stop:
                raise SystemExit()
            elif state.should_terminate:
                raise SystemTerminate()

            # We only update QoS when there are no more messages to read.
            # This groups together qos calls, and makes sure that remote
            # control commands will be prioritized over task messages.
            if qos.prev != qos.value:
                update_qos()

            try:
                next(loop)
            except StopIteration:
                loop = hub.create_loop()
    finally:
        try:
            hub.close()
        except Exception as exc:
            error(
                'Error cleaning up after event loop: %r',
                exc,
                exc_info=1,
            )