def cleanup(self):
    """
    Perform some internal accounting and cleanup.  This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.

    IMPORTANT: this function is one of the few places in the dispatcher
    (aside from setting lookups) where we talk to the database.  As such,
    if there's an outage, this method _can_ throw various
    django.db.utils.Error exceptions.  Act accordingly.

    Failures while reaping the job of a single dead worker are logged and
    swallowed so the remaining workers are still processed; a failure in
    the final ``reaper.reap`` call propagates to the caller.
    """
    orphaned = []
    # iterate over a snapshot, because workers are removed from
    # self.workers while we walk the pool
    for w in list(self.workers):
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            # read the task once so the lookup and the error message below
            # are guaranteed to talk about the same task
            task = w.current_task
            if task and task != 'QUIT':
                try:
                    for j in UnifiedJob.objects.filter(celery_task_id=task['uuid']):
                        reaper.reap_job(j, 'failed')
                except Exception:
                    logger.exception('failed to reap job UUID {}'.format(task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            # (Logger.warn is a deprecated alias; use Logger.warning)
            logger.warning('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not self.workers:
            self.up()
        # self.write() addresses workers by index, so pick a random index
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(worker.managed_tasks.keys())
    reaper.reap(excluded_uuids=running_uuids)
def reap_jobs_from_orphaned_instances(self):
    """
    Fail active jobs whose execution node is not a registered instance.

    Looks at every job in the 'pending', 'waiting' or 'running' state whose
    ``execution_node`` does not match the hostname of any ``Instance`` row.
    This is a fairly rare event, but it can occur if you, for example, SQL
    backup an awx install with running jobs and restore it elsewhere.

    Jobs with an empty ``execution_node`` and container group tasks are
    left untouched; every other match is logged and reaped as 'failed'.
    """
    active_jobs = UnifiedJob.objects.filter(
        status__in=['pending', 'waiting', 'running'],
    )
    registered_hostnames = Instance.objects.values_list('hostname', flat=True)
    for j in active_jobs.exclude(execution_node__in=registered_hostnames):
        # nothing to reap if the job was never assigned a node, or if it
        # is a container group task
        if not j.execution_node or j.is_container_group_task:
            continue
        logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
        reap_job(j, 'failed')
def cleanup(self):
    """
    Perform some internal accounting and cleanup.  This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.

    Failures while reaping the job of a dead worker are logged and
    swallowed so that the remaining workers are still processed and the
    orphaned messages are still redistributed.
    """
    orphaned = []
    # iterate over a snapshot, because workers are removed from
    # self.workers while we walk the pool
    for w in list(self.workers):
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            # read the task once so the lookup and the error message below
            # are guaranteed to talk about the same task
            task = w.current_task
            if task and task != 'QUIT':
                try:
                    for j in UnifiedJob.objects.filter(celery_task_id=task['uuid']):
                        reaper.reap_job(j, 'failed')
                except Exception:
                    logger.exception('failed to reap job UUID {}'.format(task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            # (Logger.warn is a deprecated alias; use Logger.warning)
            logger.warning('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not self.workers:
            self.up()
        # self.write() addresses workers by index, so pick a random index
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)
def cleanup(self):
    """
    Perform some internal accounting and cleanup.  This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.

    This method does not let reaping failures escape: errors while reaping
    the job of a dead worker are logged, and errors from the final
    ``reaper.reap`` call are printed to stderr (type, message and
    traceback) without going through the logger.
    """
    orphaned = []
    # iterate over a snapshot, because workers are removed from
    # self.workers while we walk the pool
    for w in list(self.workers):
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            # read the task once so the lookup and the error message below
            # are guaranteed to talk about the same task
            task = w.current_task
            if task and task != 'QUIT':
                try:
                    for j in UnifiedJob.objects.filter(celery_task_id=task['uuid']):
                        reaper.reap_job(j, 'failed')
                except Exception:
                    logger.exception('failed to reap job UUID {}'.format(task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            # (Logger.warn is a deprecated alias; use Logger.warning)
            logger.warning('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not self.workers:
            self.up()
        # self.write() addresses workers by index, so pick a random index
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(worker.managed_tasks.keys())
    try:
        reaper.reap(excluded_uuids=running_uuids)
    except Exception:
        # we _probably_ failed here due to DB connectivity issues, so
        # don't use our logger (it accesses the database for configuration);
        # print_exc() reports the exception type and message along with the
        # traceback, whereas print_tb() would only show the stack frames
        traceback.print_exc()
def cleanup(self):
    """
    Perform some internal accounting and cleanup.  This is run on
    every cluster node heartbeat:

    1.  Discover worker processes that exited, and recover messages they
        were handling.
    2.  Clean up unnecessary, idle workers.
    3.  Send SIGTERM to a worker whose task manager, dependency manager or
        workflow manager run has been going for longer than
        TASK_MANAGER_TIMEOUT + TASK_MANAGER_TIMEOUT_GRACE_PERIOD.
    4.  Check to see if the database says this node is running any tasks
        that aren't actually running.  If so, reap them.

    IMPORTANT: this function is one of the few places in the dispatcher
    (aside from setting lookups) where we talk to the database.  As such,
    if there's an outage, this method _can_ throw various
    django.db.utils.Error exceptions.  Act accordingly.
    """
    orphaned = []
    # iterate over a snapshot, because workers are removed from
    # self.workers while we walk the pool
    for w in list(self.workers):
        if not w.alive:
            # the worker process has exited
            # 1. take the task it was running and enqueue the error
            #    callbacks
            # 2. take any pending tasks delivered to its queue and
            #    send them to another worker
            logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
            # read the task once so the lookup and the error message below
            # are guaranteed to talk about the same task
            task = w.current_task
            if task and task != 'QUIT':
                try:
                    for j in UnifiedJob.objects.filter(celery_task_id=task['uuid']):
                        reaper.reap_job(j, 'failed')
                except Exception:
                    logger.exception('failed to reap job UUID {}'.format(task['uuid']))
            orphaned.extend(w.orphaned_tasks)
            self.workers.remove(w)
        elif w.idle and len(self.workers) > self.min_workers:
            # the process has an empty queue (it's idle) and we have
            # more processes in the pool than we need (> min)
            # send this process a message so it will exit gracefully
            # at the next opportunity
            logger.debug('scaling down worker pid:{}'.format(w.pid))
            w.quit()
            self.workers.remove(w)
        if w.alive:
            # if we discover a task manager invocation that's been running
            # too long, reap it (because otherwise it'll just hold the postgres
            # advisory lock forever); the goal of this code is to discover
            # deadlocks or other serious issues in the task manager that cause
            # the task manager to never do more work
            current_task = w.current_task
            if current_task and isinstance(current_task, dict):
                endings = ('tasks.task_manager', 'tasks.dependency_manager', 'tasks.workflow_manager')
                current_task_name = current_task.get('task', '')
                if current_task_name.endswith(endings):
                    now = time.time()
                    tracked = w.managed_tasks[current_task['uuid']]
                    # the start time is recorded on the worker's managed
                    # task entry; read it back from there when the dict
                    # handed out by w.current_task does not carry it, so
                    # this works whether or not the two are the same object
                    if 'started' in current_task:
                        started = current_task['started']
                    else:
                        started = tracked.setdefault('started', now)
                    age = now - started
                    tracked['age'] = age
                    if age > (settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD):
                        logger.error(f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}')  # noqa
                        try:
                            os.kill(w.pid, signal.SIGTERM)
                        except ProcessLookupError:
                            # the worker exited after the liveness check;
                            # don't abort the heartbeat (and lose the
                            # orphaned messages collected above) over it
                            logger.warning('worker pid:{} exited before SIGTERM could be delivered'.format(w.pid))

    for m in orphaned:
        # if all the workers are dead, spawn at least one
        if not self.workers:
            self.up()
        # self.write() addresses workers by index, so pick a random index
        idx = random.choice(range(len(self.workers)))
        self.write(idx, m)

    # if the database says a job is running on this node, but it's *not*,
    # then reap it
    running_uuids = []
    for worker in self.workers:
        worker.calculate_managed_tasks()
        running_uuids.extend(worker.managed_tasks.keys())
    reaper.reap(excluded_uuids=running_uuids)