Example #1
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.

        IMPORTANT: this function is one of the few places in the dispatcher
        (aside from setting lookups) where we talk to the database.  As such,
        if there's an outage, this method _can_ throw various
        django.db.utils.Error exceptions.  Act accordingly.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warning('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        reaper.reap(excluded_uuids=running_uuids)
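The IMPORTANT note in the docstring above warns that cleanup() can raise django.db.utils errors during a database outage. A minimal sketch of how a caller on the heartbeat path might guard against that, assuming django.db.DatabaseError is the relevant exception family; on_heartbeat and the pool argument are illustrative names, not part of the example:

import traceback

from django.db import DatabaseError


def on_heartbeat(pool):
    # hypothetical caller; `pool` is assumed to expose the cleanup() shown above
    try:
        pool.cleanup()
    except DatabaseError:
        # during an outage the regular logger may itself need the database
        # for configuration, so fall back to printing the traceback directly
        traceback.print_exc()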
Example #2
    def reap_jobs_from_orphaned_instances(self):
        # discover jobs that are in running state but aren't on an execution node
        # that we know about; this is a fairly rare event, but it can occur if you,
        # for example, SQL backup an awx install with running jobs and restore it
        # elsewhere
        for j in UnifiedJob.objects.filter(
            status__in=['pending', 'waiting', 'running'],
        ).exclude(execution_node__in=Instance.objects.values_list('hostname', flat=True)):
            if j.execution_node and not j.is_container_group_task:
                logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
                reap_job(j, 'failed')
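The filter/exclude chain above can be easier to read if the set of registered hostnames is built first. A rough sketch (not the project's code) that is equivalent in effect for jobs that actually have an execution_node set:

known_hostnames = set(Instance.objects.values_list('hostname', flat=True))
for j in UnifiedJob.objects.filter(status__in=['pending', 'waiting', 'running']):
    # same outcome as .exclude(execution_node__in=...) followed by the truthiness check
    if j.execution_node and j.execution_node not in known_hostnames and not j.is_container_group_task:
        logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
        reap_job(j, 'failed')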
Example #3
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(
                    w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(
                                    celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception(
                                'failed to reap job UUID {}'.format(
                                    w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warning('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)
Example #4
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(
                    w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(
                                    celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception(
                                'failed to reap job UUID {}'.format(
                                    w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warning('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        try:
            reaper.reap(excluded_uuids=running_uuids)
        except Exception:
            # we _probably_ failed here due to DB connectivity issues, so
            # don't use our logger (it accesses the database for configuration)
            _, _, tb = sys.exc_info()
            traceback.print_tb(tb)
Example #5
File: pool.py Project: timkids/awx
    def cleanup(self):
        """
        Perform some internal accounting and cleanup.  This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        3.  Check to see if the database says this node is running any tasks
            that aren't actually running.  If so, reap them.

        IMPORTANT: this function is one of the few places in the dispatcher
        (aside from setting lookups) where we talk to the database.  As such,
        if there's an outage, this method _can_ throw various
        django.db.utils.Error exceptions.  Act accordingly.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(
                    w.pid, w.exitcode))
                if w.current_task:
                    if w.current_task != 'QUIT':
                        try:
                            for j in UnifiedJob.objects.filter(
                                    celery_task_id=w.current_task['uuid']):
                                reaper.reap_job(j, 'failed')
                        except Exception:
                            logger.exception(
                                'failed to reap job UUID {}'.format(
                                    w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.debug('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)
            if w.alive:
                # if we discover a task manager invocation that's been running
                # too long, reap it (because otherwise it'll just hold the postgres
                # advisory lock forever); the goal of this code is to discover
                # deadlocks or other serious issues in the task manager that cause
                # the task manager to never do more work
                current_task = w.current_task
                if current_task and isinstance(current_task, dict):
                    endings = [
                        'tasks.task_manager', 'tasks.dependency_manager',
                        'tasks.workflow_manager'
                    ]
                    current_task_name = current_task.get('task', '')
                    if any(current_task_name.endswith(e) for e in endings):
                        if 'started' not in current_task:
                            w.managed_tasks[
                                current_task['uuid']]['started'] = time.time()
                        age = time.time() - current_task['started']
                        w.managed_tasks[current_task['uuid']]['age'] = age
                        if age > (settings.TASK_MANAGER_TIMEOUT +
                                  settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD):
                            logger.error(
                                f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}'
                            )  # noqa
                            os.kill(w.pid, signal.SIGTERM)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

        # if the database says a job is running on this node, but it's *not*,
        # then reap it
        running_uuids = []
        for worker in self.workers:
            worker.calculate_managed_tasks()
            running_uuids.extend(list(worker.managed_tasks.keys()))
        reaper.reap(excluded_uuids=running_uuids)
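Example #5 also guards against a task manager invocation holding the PostgreSQL advisory lock indefinitely. A standalone sketch of the age check it performs, assuming the two settings are expressed in seconds (the setting names come from the example; the values below are placeholders, not AWX defaults):

import time

# placeholder values for illustration only
TASK_MANAGER_TIMEOUT = 300
TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60


def lock_held_too_long(started, now=None):
    """Return True once a task manager run has outlived its timeout budget."""
    now = time.time() if now is None else now
    age = now - started
    return age > (TASK_MANAGER_TIMEOUT + TASK_MANAGER_TIMEOUT_GRACE_PERIOD)

# e.g. lock_held_too_long(current_task['started']) would gate the SIGTERM above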