def check_workers(self):
    """
    Look for missing workers and hand each one to _delete_worker() for cleanup.

    A worker is considered missing when its last_heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The cutoff is a naive UTC datetime; heartbeat
    times are expected to be stored the same way so that the two are comparable.

    For each missing worker found, an error is logged and _delete_worker() is called
    with the worker's name.

    This method logs at the debug and error levels.
    """
    msg = _('Looking for workers missing for more than %s seconds') % \
        self.WORKER_TIMEOUT_SECONDS
    _logger.debug(msg)

    oldest_heartbeat_time = datetime.utcnow() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    # Only records with a heartbeat older than the cutoff are selected by the query.
    worker_list = Worker.objects(last_heartbeat__lt=oldest_heartbeat_time)

    for worker in worker_list:
        # One message is emitted per missing worker, so the subject is singular.
        msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
        _logger.error(msg)
        _delete_worker(worker.name)
def handle_worker_offline(event):
    """
    Celery event handler for 'worker-offline' events.

    The event is always parsed and logged through _parse_and_log_event(). When the
    event comes from the resource manager nothing else happens. For any other worker,
    the shutdown is logged at the info level and _delete_worker() is called with
    normal_shutdown=True so that the cleanup associated with a worker going offline
    can take place.

    :param event: A celery event to handle.
    :type event: dict
    """
    parsed_event = _parse_and_log_event(event)

    # Events from the resource manager need no further processing.
    if not _is_resource_manager(event):
        _logger.info(_("Worker '%(worker_name)s' shutdown") % parsed_event)
        _delete_worker(parsed_event['worker_name'], normal_shutdown=True)
def check_celery_processes(self):
    """
    Look for missing Celery processes, log and cleanup as needed.

    Every Worker record is loaded and its last_heartbeat is compared against
    now - CELERY_TIMEOUT_SECONDS. A record whose heartbeat is older than that cutoff is
    logged at the error level and passed to _delete_worker() for cleanup. The remaining
    records are counted by kind, based on the prefix of the worker name: scheduler
    (pulp_celerybeat), resource manager, or regular worker.

    If no resource manager record or no scheduler record remains, an error is logged
    stating that Pulp will not operate correctly. Finally the three counts are logged
    at the debug level.
    """
    msg = _(
        'Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager '
        'processes are missing for more than %d seconds'
    ) % self.CELERY_TIMEOUT_SECONDS
    _logger.debug(msg)

    # NOTE(review): ensure_tz presumably attaches UTC tzinfo so the cutoff can be
    # compared with the stored heartbeats -- confirm against its definition.
    now = ensure_tz(datetime.utcnow())
    oldest_heartbeat_time = now - timedelta(seconds=self.CELERY_TIMEOUT_SECONDS)
    worker_list = Worker.objects.all()

    worker_count = 0
    resource_manager_count = 0
    scheduler_count = 0

    for worker in worker_list:
        if worker.last_heartbeat < oldest_heartbeat_time:
            # Stale records are cleaned up and deliberately not counted as running.
            msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
            _logger.error(msg)
            _delete_worker(worker.name)
        elif worker.name.startswith(SCHEDULER_WORKER_NAME):
            scheduler_count += 1
        elif worker.name.startswith(RESOURCE_MANAGER_WORKER_NAME):
            resource_manager_count += 1
        else:
            worker_count += 1

    if resource_manager_count == 0:
        msg = _("There are 0 pulp_resource_manager processes running. Pulp will not operate "
                "correctly without at least one pulp_resource_manager process running.")
        _logger.error(msg)

    if scheduler_count == 0:
        msg = _("There are 0 pulp_celerybeat processes running. Pulp will not operate "
                "correctly without at least one pulp_celerybeat process running.")
        _logger.error(msg)

    output_dict = {
        'workers': worker_count,
        'celerybeat': scheduler_count,
        'resource_manager': resource_manager_count,
    }
    msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
            "pulp_celerybeat processes, and %(resource_manager)d "
            "pulp_resource_manager processes") % output_dict
    _logger.debug(msg)
def check_celery_processes(self):
    """
    Look for missing Celery processes, log and cleanup as needed.

    Every Worker record is loaded and its last_heartbeat is compared against
    now - constants.PULP_PROCESS_TIMEOUT_INTERVAL. A record whose heartbeat is older
    than that cutoff is logged at the error level and cleaned up: scheduler records
    are removed directly with worker.delete(), every other record is passed to
    _delete_worker(). The remaining records are counted by kind, based on the prefix
    of the worker name: scheduler (pulp_celerybeat), resource manager, or regular
    worker.

    If no resource manager record or no scheduler record remains, an error is logged
    stating that Pulp will not operate correctly. Finally the three counts are logged
    at the debug level.
    """
    msg = _('Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes '
            'are missing for more than %d seconds') % constants.PULP_PROCESS_TIMEOUT_INTERVAL
    _logger.debug(msg)

    # NOTE(review): ensure_tz presumably attaches UTC tzinfo so the cutoff can be
    # compared with the stored heartbeats -- confirm against its definition.
    now = ensure_tz(datetime.utcnow())
    oldest_heartbeat_time = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)
    worker_list = Worker.objects.all()

    worker_count = 0
    resource_manager_count = 0
    scheduler_count = 0

    for worker in worker_list:
        if worker.last_heartbeat < oldest_heartbeat_time:
            # Stale records are cleaned up and deliberately not counted as running.
            msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
            _logger.error(msg)

            if worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                # Scheduler records are only deleted; _delete_worker() is not involved.
                worker.delete()
            else:
                _delete_worker(worker.name)
        elif worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
            scheduler_count += 1
        elif worker.name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME):
            resource_manager_count += 1
        else:
            worker_count += 1

    if resource_manager_count == 0:
        msg = _("There are 0 pulp_resource_manager processes running. Pulp will not operate "
                "correctly without at least one pulp_resource_manager process running.")
        _logger.error(msg)

    if scheduler_count == 0:
        msg = _("There are 0 pulp_celerybeat processes running. Pulp will not operate "
                "correctly without at least one pulp_celerybeat process running.")
        _logger.error(msg)

    output_dict = {'workers': worker_count, 'celerybeat': scheduler_count,
                   'resource_manager': resource_manager_count}
    msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
            "pulp_celerybeat processes, and %(resource_manager)d "
            "pulp_resource_manager processes") % output_dict
    _logger.debug(msg)
def handle_worker_offline(worker_name):
    """
    Generic handler for a worker going offline.

    The shutdown is logged at the info level, then _delete_worker() is called with
    normal_shutdown=True to take care of the cleanup associated with the worker
    going offline.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    _logger.info(_("Worker '%s' shutdown") % worker_name)
    _delete_worker(worker_name, normal_shutdown=True)
def check_workers(self):
    """
    Look for missing workers and hand each one to _delete_worker() for cleanup.

    A worker is considered missing when its last_heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The cutoff is a naive UTC datetime; heartbeat
    times are expected to be stored the same way so that the two are comparable.

    For each missing worker found, an error is logged and _delete_worker() is called
    with the worker's name.

    This method logs at the debug and error levels.
    """
    msg = _('Looking for workers missing for more than %s seconds') % \
        self.WORKER_TIMEOUT_SECONDS
    _logger.debug(msg)

    oldest_heartbeat_time = datetime.utcnow() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    # Only records with a heartbeat older than the cutoff are selected by the query.
    worker_list = Worker.objects(last_heartbeat__lt=oldest_heartbeat_time)

    for worker in worker_list:
        # One message is emitted per missing worker, so the subject is singular.
        msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
        _logger.error(msg)
        _delete_worker(worker.name)
def check_celery_processes(self):
    """
    Look for missing Celery processes, log and cleanup as needed.

    Every Worker record is loaded and its last_heartbeat is compared against
    now - PULP_PROCESS_TIMEOUT_INTERVAL. A record whose heartbeat is older than that
    cutoff is logged at the error level and cleaned up: scheduler records are removed
    directly with worker.delete(), every other record is passed to _delete_worker().
    The remaining records are counted by kind, based on the prefix of the worker name:
    scheduler (pulp_celerybeat), resource manager, or regular worker.

    A warning is logged when the time elapsed since the start of the check exceeds
    PULP_PROCESS_HEARTBEAT_INTERVAL. If no resource manager record or no scheduler
    record remains, an error is logged stating that Pulp will not operate correctly.
    Finally the three counts are logged at the debug level.
    """
    msg = _('Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes '
            'are missing for more than %d seconds') % PULP_PROCESS_TIMEOUT_INTERVAL
    _logger.debug(msg)

    # NOTE(review): ensure_tz presumably attaches UTC tzinfo so the cutoff can be
    # compared with the stored heartbeats -- confirm against its definition.
    now = ensure_tz(datetime.utcnow())
    oldest_heartbeat_time = now - timedelta(seconds=PULP_PROCESS_TIMEOUT_INTERVAL)
    worker_list = Worker.objects.all()

    # Measure once so that the value reported is the same value that was compared.
    # NOTE(review): if Worker.objects.all() is evaluated lazily, this elapsed time does
    # not include fetching the records -- confirm that this is the intended measurement.
    elapsed = ensure_tz(datetime.utcnow()) - now
    if elapsed > timedelta(seconds=PULP_PROCESS_HEARTBEAT_INTERVAL):
        msg = _("Celery process check took {time}s which exceeds heartbeat interval. Consider "
                "adjusting the worker_timeout setting.").format(time=elapsed.total_seconds())
        # Logger.warn is a deprecated alias; warning() is the supported spelling.
        _logger.warning(msg)

    worker_count = 0
    resource_manager_count = 0
    scheduler_count = 0

    for worker in worker_list:
        if worker.last_heartbeat < oldest_heartbeat_time:
            # Stale records are cleaned up and deliberately not counted as running.
            msg = _("Worker '%s' has gone missing, removing from list of workers") % worker.name
            _logger.error(msg)

            if worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
                # Scheduler records are only deleted; _delete_worker() is not involved.
                worker.delete()
            else:
                _delete_worker(worker.name)
        elif worker.name.startswith(constants.SCHEDULER_WORKER_NAME):
            scheduler_count += 1
        elif worker.name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME):
            resource_manager_count += 1
        else:
            worker_count += 1

    if resource_manager_count == 0:
        msg = _("There are 0 pulp_resource_manager processes running. Pulp will not operate "
                "correctly without at least one pulp_resource_manager process running.")
        _logger.error(msg)

    if scheduler_count == 0:
        msg = _("There are 0 pulp_celerybeat processes running. Pulp will not operate "
                "correctly without at least one pulp_celerybeat process running.")
        _logger.error(msg)

    output_dict = {
        'workers': worker_count,
        'celerybeat': scheduler_count,
        'resource_manager': resource_manager_count,
    }
    msg = _("%(workers)d pulp_worker processes, %(celerybeat)d "
            "pulp_celerybeat processes, and %(resource_manager)d "
            "pulp_resource_manager processes") % output_dict
    _logger.debug(msg)
def close(self):
    """
    Called when celerybeat is being shut down.

    The celerybeat worker name is passed to _delete_worker() flagged as a normal
    shutdown, after which the parent class's close() runs.
    """
    beat_name = CELERYBEAT_NAME
    _delete_worker(beat_name, normal_shutdown=True)

    # Explicit two-argument super() keeps this compatible with Python 2.
    super(Scheduler, self).close()
def close(self):
    """
    Clean up this scheduler's worker entry, then close the scheduler.

    self.celerybeat_name is passed to _delete_worker() flagged as a normal shutdown,
    after which the parent class's close() runs.
    """
    beat_name = self.celerybeat_name
    _delete_worker(beat_name, normal_shutdown=True)

    # Explicit two-argument super() keeps this compatible with Python 2.
    super(Scheduler, self).close()