Example #1
0
def handle_worker_offline(event):
    """
    React to a Celery 'worker-offline' event.

    Celery emits 'worker-offline' when a worker shuts down gracefully; the event is not emitted
    when a worker is killed instantly.

    The event is always parsed and logged first. An event coming from the resource manager
    needs no further handling. For any other worker, the shutdown is logged at the info level
    and a _delete_worker() task, flagged as a normal shutdown, is sent to the resource manager
    queue so that the resource manager removes the worker record and handles any work cleanup
    associated with the worker going offline.

    :param event: A celery event to handle.
    :type event: dict
    """
    # Parsing (and its logging) happens for every event, including resource manager ones.
    details = _parse_and_log_event(event)

    if not _is_resource_manager(event):
        _logger.info(_("Worker '%(worker_name)s' shutdown") % details)
        _delete_worker.apply_async(
            args=(details['worker_name'],),
            kwargs={'normal_shutdown': True},
            queue=RESOURCE_MANAGER_QUEUE,
        )
Example #2
0
    def check_workers(self):
        """
        Find workers whose heartbeat has gone stale and dispatch cleanup for each of them.

        A worker counts as missing when its last_heartbeat is older than
        utcnow() - WORKER_TIMEOUT_SECONDS. Heartbeat times are stored in native UTC, so the
        cutoff computed here is directly comparable to them.

        For every missing worker, a _delete_worker task is sent to the resource manager queue,
        requesting that the resource manager delete the Worker and clean up any associated work.

        This method logs at the debug and error levels.
        """
        _logger.debug(
            _('Looking for workers missing for more than %s seconds') % self.WORKER_TIMEOUT_SECONDS)

        # Anything whose last heartbeat predates this instant is considered missing.
        now = datetime.utcnow()
        cutoff = now - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)

        stale_filter = {'last_heartbeat': {'$lt': cutoff}}
        wanted_fields = ('_id', 'last_heartbeat', 'num_reservations')
        criteria = Criteria(filters=stale_filter, fields=wanted_fields)

        # The full result set is collected before any cleanup task is dispatched.
        missing_workers = list(resources.filter_workers(criteria))
        for missing in missing_workers:
            _logger.error(
                _("Workers '%s' has gone missing, removing from list of workers") % missing.name)
            _delete_worker.apply_async(args=(missing.name,), queue=RESOURCE_MANAGER_QUEUE)
Example #3
0
def handle_worker_offline(event):
    """
    Handle a 'worker-offline' event received from Celery.

    A worker emits 'worker-offline' when it shuts down gracefully. No such event is emitted
    when a worker is killed instantly.

    Processing steps:

    1. Parse and log the event (done for every event).
    2. Stop if the event was sent by the resource manager; nothing else is required.
    3. Otherwise log the shutdown at the info level and dispatch a _delete_worker() task,
       marked as a normal shutdown, to the resource manager queue. The resource manager then
       removes the worker record and handles any work cleanup associated with the worker
       going offline.

    :param event: A celery event to handle.
    :type event: dict
    """
    parsed_event = _parse_and_log_event(event)

    from_resource_manager = _is_resource_manager(event)
    if from_resource_manager:
        # The resource manager going offline requires no cleanup task.
        return

    shutdown_message = _("Worker '%(worker_name)s' shutdown") % parsed_event
    _logger.info(shutdown_message)

    task_args = (parsed_event['worker_name'],)
    task_kwargs = {'normal_shutdown': True}
    _delete_worker.apply_async(args=task_args, kwargs=task_kwargs, queue=RESOURCE_MANAGER_QUEUE)
Example #4
0
    def check_workers(self):
        """
        Detect workers that have stopped sending heartbeats and request their removal.

        Missing workers are found by filtering the Workers model for entries whose
        last_heartbeat is older than utcnow() - WORKER_TIMEOUT_SECONDS. The heartbeat times are
        stored in native UTC, so that threshold is a comparable datetime.

        A _delete_worker task is dispatched to the resource manager queue for each missing
        worker, asking the resource manager to delete the Worker and clean up any associated
        work.

        This method logs at the debug and error levels.
        """
        debug_message = _('Looking for workers missing for more than %s seconds')
        _logger.debug(debug_message % self.WORKER_TIMEOUT_SECONDS)

        # Heartbeats older than this threshold mark a worker as missing.
        current_time = datetime.utcnow()
        allowed_silence = timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
        threshold = current_time - allowed_silence

        criteria = Criteria(
            filters={'last_heartbeat': {'$lt': threshold}},
            fields=('_id', 'last_heartbeat', 'num_reservations'),
        )

        # Collect every matching worker up front, then report and dispatch one by one.
        for stale_worker in list(resources.filter_workers(criteria)):
            error_template = _("Workers '%s' has gone missing, removing from list of workers")
            _logger.error(error_template % stale_worker.name)
            _delete_worker.apply_async(
                args=(stale_worker.name,),
                queue=RESOURCE_MANAGER_QUEUE,
            )