Esempio n. 1
0
    def check_workers(self):
        """
        Find workers whose heartbeat has expired and clean each of them up.

        A worker counts as missing when its last_heartbeat is older than
        utcnow() - WORKER_TIMEOUT_SECONDS. The cutoff is a naive UTC datetime, so the comparison
        relies on the heartbeat times being stored in naive UTC as well.

        Each missing worker is reported at the error level and then passed to _delete_worker(),
        which is expected to remove the Worker record and clean up any associated work.

        This method logs at the debug and error levels.
        """
        timeout = self.WORKER_TIMEOUT_SECONDS
        _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout)

        # Anything that last reported in before this instant is considered gone.
        cutoff = datetime.utcnow() - timedelta(seconds=timeout)
        for missing in Worker.objects(last_heartbeat__lt=cutoff):
            _logger.error(
                _("Workers '%s' has gone missing, removing from list of workers") % missing.name)
            _delete_worker(missing.name)
Esempio n. 2
0
def handle_worker_offline(event):
    """
    Celery event handler for 'worker-offline' events.

    A 'worker-offline' event is emitted when a worker shuts down gracefully; it is not emitted
    when a worker is killed instantly.

    The event is always parsed and logged first. Events that come from the resource manager
    need no further processing. For any other worker the shutdown is logged at the info level
    and _delete_worker() is called with normal_shutdown=True so that the worker record is
    removed and the work associated with that worker is cleaned up.

    :param event: A celery event to handle.
    :type event: dict
    """
    event_info = _parse_and_log_event(event)

    # Only non-resource-manager workers require cleanup.
    if not _is_resource_manager(event):
        _logger.info(_("Worker '%(worker_name)s' shutdown") % event_info)
        _delete_worker(event_info['worker_name'], normal_shutdown=True)
Esempio n. 3
0
def handle_worker_offline(event):
    """
    React to a Celery 'worker-offline' event.

    Celery emits 'worker-offline' when a worker shuts down gracefully. The event is not emitted
    when a worker is killed instantly.

    Processing steps:

    1. Parse and log the event.
    2. Stop if the event originates from the resource manager.
    3. Otherwise log the shutdown at the info level and call _delete_worker() with
       normal_shutdown=True, which removes the record and handles the cleanup of any work
       associated with the worker that went offline.

    :param event: A celery event to handle.
    :type event: dict
    """
    parsed = _parse_and_log_event(event)

    if not _is_resource_manager(event):
        shutdown_notice = _("Worker '%(worker_name)s' shutdown") % parsed
        _logger.info(shutdown_notice)
        _delete_worker(parsed['worker_name'], normal_shutdown=True)
Esempio n. 4
0
    def check_celery_processes(self):
        """
        Look for missing Celery processes, log and cleanup as needed.

        Every Worker record is inspected. A record whose last_heartbeat is older than
        utcnow() - CELERY_TIMEOUT_SECONDS is treated as a missing process: it is logged at the
        error level and _delete_worker() is called synchronously for cleanup. The current time
        is passed through ensure_tz() before the comparison, so the stored heartbeats must be
        comparable with whatever ensure_tz() returns.

        The remaining (live) records are counted per process type, based on the prefix of the
        worker name: scheduler, resource manager, or plain worker.

        This method also checks that at least one resource_manager and one scheduler process is
        present. If there are zero of either, log at the error level that Pulp will not operate
        correctly. A summary of the counts is logged at the debug level.
        """
        msg = _(
            'Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager '
            'processes are missing for more than %d seconds'
        ) % self.CELERY_TIMEOUT_SECONDS
        _logger.debug(msg)
        now = ensure_tz(datetime.utcnow())
        oldest_heartbeat_time = now - timedelta(
            seconds=self.CELERY_TIMEOUT_SECONDS)
        worker_list = Worker.objects.all()
        worker_count = 0
        resource_manager_count = 0
        scheduler_count = 0
        for worker in worker_list:
            if worker.last_heartbeat < oldest_heartbeat_time:
                # Stale records are cleaned up and deliberately not counted as running.
                msg = _(
                    "Worker '%s' has gone missing, removing from list of workers"
                ) % worker.name
                _logger.error(msg)
                _delete_worker(worker.name)
            elif worker.name.startswith(SCHEDULER_WORKER_NAME):
                scheduler_count += 1
            elif worker.name.startswith(RESOURCE_MANAGER_WORKER_NAME):
                resource_manager_count += 1
            else:
                worker_count += 1
        if resource_manager_count == 0:
            msg = _(
                "There are 0 pulp_resource_manager processes running. Pulp will not operate "
                "correctly without at least one pulp_resource_manager process running."
            )
            _logger.error(msg)
        if scheduler_count == 0:
            msg = _(
                "There are 0 pulp_celerybeat processes running. Pulp will not operate "
                "correctly without at least one pulp_celerybeat process running."
            )
            _logger.error(msg)
        output_dict = {
            'workers': worker_count,
            'celerybeat': scheduler_count,
            'resource_manager': resource_manager_count
        }
        msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
                "pulp_celerybeat processes, and %(resource_manager)d "
                "pulp_resource_manager processes") % output_dict
        _logger.debug(msg)
Esempio n. 5
0
    def check_celery_processes(self):
        """
        Look for missing Celery processes, log and cleanup as needed.

        Every Worker record is inspected. A record whose last_heartbeat is older than
        utcnow() - constants.PULP_PROCESS_TIMEOUT_INTERVAL is treated as a missing process and
        logged at the error level. A missing scheduler record is deleted directly, any other
        missing record is cleaned up by calling _delete_worker() synchronously. The current time
        is passed through ensure_tz() before the comparison, so the stored heartbeats must be
        comparable with whatever ensure_tz() returns.

        The remaining (live) records are counted per process type, based on the prefix of the
        worker name: scheduler, resource manager, or plain worker.

        This method also checks that at least one resource_manager and one scheduler process is
        present. If there are zero of either, log at the error level that Pulp will not operate
        correctly. A summary of the counts is logged at the debug level.
        """
        msg = _('Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes '
                'are missing for more than %d seconds') % constants.PULP_PROCESS_TIMEOUT_INTERVAL
        _logger.debug(msg)
        now = ensure_tz(datetime.utcnow())
        oldest_heartbeat_time = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)
        worker_list = Worker.objects.all()
        worker_count = 0
        resource_manager_count = 0
        scheduler_count = 0

        for worker in worker_list:
            if worker.last_heartbeat < oldest_heartbeat_time:
                # Stale records are cleaned up and deliberately not counted as running.
                msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
                _logger.error(msg)

                if worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                    # Scheduler records are removed directly rather than via _delete_worker().
                    worker.delete()
                else:
                    _delete_worker(worker.name)
            elif worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                scheduler_count += 1
            elif worker.name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME):
                resource_manager_count += 1
            else:
                worker_count += 1

        if resource_manager_count == 0:
            msg = _("There are 0 pulp_resource_manager processes running. Pulp will not operate "
                    "correctly without at least one pulp_resource_manager process running.")
            _logger.error(msg)

        if scheduler_count == 0:
            msg = _("There are 0 pulp_celerybeat processes running. Pulp will not operate "
                    "correctly without at least one pulp_celerybeat process running.")
            _logger.error(msg)
        output_dict = {'workers': worker_count, 'celerybeat': scheduler_count,
                       'resource_manager': resource_manager_count}
        msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
                "pulp_celerybeat processes, and %(resource_manager)d "
                "pulp_resource_manager processes") % output_dict
        _logger.debug(msg)
Esempio n. 6
0
def handle_worker_offline(worker_name):
    """
    Handle a worker going offline, regardless of how the shutdown was detected.

    The shutdown is logged at the info level, then _delete_worker() is called with
    normal_shutdown=True to handle any work cleanup associated with the worker going offline.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    _logger.info(_("Worker '%s' shutdown") % worker_name)
    _delete_worker(worker_name, normal_shutdown=True)
Esempio n. 7
0
    def check_workers(self):
        """
        Detect workers that stopped sending heartbeats and trigger their cleanup.

        The Worker model is filtered for entries whose last_heartbeat is older than
        utcnow() - WORKER_TIMEOUT_SECONDS. The threshold is a naive UTC datetime, so the filter
        relies on the heartbeat times being stored in naive UTC as well.

        For every match an error is logged and _delete_worker() is called with the worker's
        name, which is expected to delete the Worker and clean up any associated work.

        This method logs at the debug and error levels.
        """
        timeout_seconds = self.WORKER_TIMEOUT_SECONDS
        _logger.debug(
            _('Looking for workers missing for more than %s seconds') % timeout_seconds)

        threshold = datetime.utcnow() - timedelta(seconds=timeout_seconds)
        expired_workers = Worker.objects(last_heartbeat__lt=threshold)
        for expired in expired_workers:
            name = expired.name
            _logger.error(
                _("Workers '%s' has gone missing, removing from list of workers") % name)
            _delete_worker(name)
Esempio n. 8
0
    def check_celery_processes(self):
        """
        Look for missing Celery processes, log and cleanup as needed.

        Every Worker record is inspected. A record whose last_heartbeat is older than
        utcnow() - PULP_PROCESS_TIMEOUT_INTERVAL is treated as a missing process and logged at
        the error level. A missing scheduler record is deleted directly, any other missing
        record is cleaned up by calling _delete_worker() synchronously. The current time is
        passed through ensure_tz() before the comparison, so the stored heartbeats must be
        comparable with whatever ensure_tz() returns.

        If obtaining the worker list takes longer than PULP_PROCESS_HEARTBEAT_INTERVAL seconds,
        a warning is logged suggesting that the worker_timeout setting be adjusted.

        The remaining (live) records are counted per process type, based on the prefix of the
        worker name: scheduler, resource manager, or plain worker.

        This method also checks that at least one resource_manager and one scheduler process is
        present. If there are zero of either, log at the error level that Pulp will not operate
        correctly. A summary of the counts is logged at the debug level.
        """
        msg = _(
            'Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes '
            'are missing for more than %d seconds'
        ) % PULP_PROCESS_TIMEOUT_INTERVAL
        _logger.debug(msg)
        now = ensure_tz(datetime.utcnow())
        oldest_heartbeat_time = now - timedelta(
            seconds=PULP_PROCESS_TIMEOUT_INTERVAL)
        worker_list = Worker.objects.all()

        # Measure the elapsed time once so the value that is compared is the value reported.
        # NOTE(review): this only times the Worker.objects.all() call above; if the returned
        # queryset is lazy, the actual fetch happens in the loop below -- confirm that this
        # measures what is intended.
        elapsed = ensure_tz(datetime.utcnow()) - now
        if elapsed > timedelta(seconds=PULP_PROCESS_HEARTBEAT_INTERVAL):
            msg = _(
                "Celery process check took {time}s which exceeds heartbeat interval. Consider "
                "adjusting the worker_timeout setting.").format(time=elapsed.total_seconds())
            # Logger.warn is a deprecated alias; warning() is the supported spelling.
            _logger.warning(msg)

        worker_count = 0
        resource_manager_count = 0
        scheduler_count = 0

        for worker in worker_list:
            if worker.last_heartbeat < oldest_heartbeat_time:
                # Stale records are cleaned up and deliberately not counted as running.
                msg = _(
                    "Worker '%s' has gone missing, removing from list of workers"
                ) % worker.name
                _logger.error(msg)

                if worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                    # Scheduler records are removed directly rather than via _delete_worker().
                    worker.delete()
                else:
                    _delete_worker(worker.name)
            elif worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                scheduler_count += 1
            elif worker.name.startswith(
                    constants.RESOURCE_MANAGER_WORKER_NAME):
                resource_manager_count += 1
            else:
                worker_count += 1

        if resource_manager_count == 0:
            msg = _(
                "There are 0 pulp_resource_manager processes running. Pulp will not operate "
                "correctly without at least one pulp_resource_manager process running."
            )
            _logger.error(msg)

        if scheduler_count == 0:
            msg = _(
                "There are 0 pulp_celerybeat processes running. Pulp will not operate "
                "correctly without at least one pulp_celerybeat process running."
            )
            _logger.error(msg)
        output_dict = {
            'workers': worker_count,
            'celerybeat': scheduler_count,
            'resource_manager': resource_manager_count
        }
        msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
                "pulp_celerybeat processes, and %(resource_manager)d "
                "pulp_resource_manager processes") % output_dict
        _logger.debug(msg)
Esempio n. 9
0
 def close(self):
     """
     This is called when celerybeat is being shutdown.

     Removes the celerybeat worker record through _delete_worker() (flagged as a normal
     shutdown) and then delegates to the parent Scheduler's close().
     """
     try:
         _delete_worker(CELERYBEAT_NAME, normal_shutdown=True)
     finally:
         # Always let the parent scheduler finish its own shutdown, even if removing the
         # worker record fails; an error raised above still propagates afterwards.
         super(Scheduler, self).close()
Esempio n. 10
0
 def close(self):
     """
     Shut the scheduler down.

     Removes this scheduler's worker record through _delete_worker() (flagged as a normal
     shutdown) and then delegates to the parent Scheduler's close().
     """
     try:
         _delete_worker(self.celerybeat_name, normal_shutdown=True)
     finally:
         # Always let the parent scheduler finish its own shutdown, even if removing the
         # worker record fails; an error raised above still propagates afterwards.
         super(Scheduler, self).close()
Esempio n. 11
0
 def close(self):
     """
     Shut the scheduler down.

     Calls _delete_worker() for self.celerybeat_name with normal_shutdown=True, then hands
     over to the parent Scheduler's close().
     """
     try:
         _delete_worker(self.celerybeat_name, normal_shutdown=True)
     finally:
         # The parent close() must run even when the worker record cannot be removed;
         # an exception from _delete_worker() is still raised to the caller afterwards.
         super(Scheduler, self).close()