def babysit():
    """
    Babysit the workers, updating our tables with information about their queues.

    The active queues reported by Celery are compared with the AvailableQueues stored in the
    database:

    * reserved workers that are not consuming from a queue of their own name are subscribed
      to one;
    * active reserved workers that are not in the database yet are added to it;
    * queues that were marked as missing but are active again get their missing_since
      attribute reset to None;
    * queues that are not active get missing_since set the first time they are noticed, and a
      _delete_queue task is dispatched once they have been missing for more than five minutes.
    """
    # Inspect the available workers to build our state variables. Celery hands back None
    # instead of an empty dict when no worker replies, so treat that as "no active workers".
    active_queues = controller.inspect().active_queues() or {}

    # Now we need the entire list of AvailableQueues from the database, though we only need their
    # _id and missing_since attributes. This is preferable to using a Map/Reduce operation to get
    # Mongo to tell us which workers Celery knows about that aren't found in the database.
    all_queues_criteria = Criteria(filters={}, fields=('_id', 'missing_since'))
    all_queues = list(resources.filter_available_queues(all_queues_criteria))
    all_queues_set = set(q.name for q in all_queues)

    active_queues_set = set()
    for worker, queues in active_queues.items():
        # Only reserved task workers are tracked in the available_queues collection. A plain
        # prefix comparison is used so that the prefix is never interpreted as a regex.
        if not worker.startswith(RESERVED_WORKER_NAME_PREFIX):
            continue
        # Make sure that this worker is subscribed to a queue of its own name. If not,
        # subscribe it to one.
        if worker not in [queue['name'] for queue in queues]:
            controller.add_consumer(queue=worker, destination=(worker,))
        active_queues_set.add(worker)

    # Determine which queues are in active_queues_set that aren't in all_queues_set. These are new
    # workers and we need to add them to the database.
    for worker in (active_queues_set - all_queues_set):
        resources.get_or_create_available_queue(worker)

    # If there are any AvailableQueues that have their missing_since attribute set and they are
    # present now, let's set their missing_since attribute back to None.
    missing_since_queues = set(q.name for q in all_queues if q.missing_since is not None)
    for queue in (active_queues_set & missing_since_queues):
        active_queue = resources.get_or_create_available_queue(queue)
        active_queue.missing_since = None
        active_queue.save()

    # Now we must delete queues for workers that don't exist anymore, but only if they've been
    # missing for at least five minutes.
    for queue in (all_queues_set - active_queues_set):
        # Fetch the queue again by _id rather than reusing the entry from all_queues, which
        # was retrieved with only the _id and missing_since fields.
        matches = list(resources.filter_available_queues(Criteria(filters={'_id': queue})))
        if not matches:
            # The queue disappeared from the database since all_queues was read, so there is
            # nothing left to mark or delete.
            continue
        active_queue = matches[0]

        # We only want to delete this queue if it has been missing for at least 5 minutes. If this
        # AvailableQueue doesn't have a missing_since attribute, that means it has just now gone
        # missing. Let's mark its missing_since attribute and continue.
        if active_queue.missing_since is None:
            active_queue.missing_since = datetime.utcnow()
            active_queue.save()
            continue

        # This queue has been missing for some time. Let's check to see if it's been 5 minutes yet,
        # and if it has, let's delete it.
        if active_queue.missing_since < datetime.utcnow() - timedelta(minutes=5):
            _delete_queue.apply_async(args=(queue,), queue=RESOURCE_MANAGER_QUEUE)
def _release_resource(resource_id):
    """
    Do not queue this task yourself, but always use the _queue_release_resource() task instead.
    Please see the docblock on that function for an explanation.

    When a resource-reserving task is complete, this method must be called with the resource_id
    so that we know when it is safe to unmap a resource_id from its given queue name.

    The reservation count is decremented on the ReservedResource and then on the AvailableQueue
    of the worker it was assigned to. A missing ReservedResource or AvailableQueue is tolerated
    silently.

    :param resource_id: The resource that is no longer in use
    :type  resource_id: basestring
    """
    dedicated_queue_suffix = '.dq'
    try:
        reserved_resource = ReservedResource(resource_id)
        reserved_resource.decrement_num_reservations()
        # Now we need to decrement the AvailableQueue that the reserved_resource was using. If the
        # ReservedResource does not exist for some reason, we won't know its assigned_queue, but
        # these next lines won't execute anyway.
        # Remove the '.dq' suffix from the queue name to get the worker name. str.rstrip() is
        # not suitable here: it strips any trailing '.', 'd' or 'q' characters and would also
        # eat the end of worker names such as 'worker@prod'.
        worker_name = reserved_resource.assigned_queue
        if worker_name.endswith(dedicated_queue_suffix):
            worker_name = worker_name[:-len(dedicated_queue_suffix)]
        aqc = Criteria(filters={'_id': worker_name})
        aq_list = list(resources.filter_available_queues(aqc))
        # The AvailableQueue may already have been deleted; that is as acceptable as the
        # DoesNotExist case handled below.
        if aq_list:
            aq_list[0].decrement_num_reservations()
    except DoesNotExist:
        # If we are trying to decrement the count on one of these objects, and they don't exist,
        # that's OK
        pass
def _reserve_resource(resource_id):
    """
    When you wish you queue a task that needs to reserve a resource, you should make a call to this
    function() first, queueing it in the RESOURCE_MANAGER_QUEUE. This Task will return the name of
    the queue you should put your task in.

    Please be sure to also add a task to run _queue_release_resource() in the same queue name that
    this function returns to you. It is important that _release_resource() is called after your
    task is completed, regardless of whether your task completes successfully or not.

    :param resource_id: The name of the resource you wish to reserve for your task. The system
                        will ensure that no other tasks that want that same reservation will run
                        concurrently with yours.
    :type  resource_id: basestring
    :return:            The name of a queue that you should put your task in
    :rtype:             basestring
    """
    dedicated_queue_suffix = '.dq'
    reserved_resource = resources.get_or_create_reserved_resource(resource_id)
    if reserved_resource.assigned_queue is None:
        # The assigned_queue will be None if the reserved_resource was just created, so we'll
        # need to assign a queue to it
        # get the dedicated queue name by adding '.dq' to the end of the worker name
        reserved_resource.assigned_queue = \
            resources.get_least_busy_available_queue().name + dedicated_queue_suffix
        reserved_resource.save()
    else:
        # The assigned_queue is set, so we just need to increment the num_reservations on the
        # reserved resource
        reserved_resource.increment_num_reservations()

    # Remove the '.dq' suffix from the queue name to get the worker name. str.rstrip() is not
    # suitable here: it strips any trailing '.', 'd' or 'q' characters and would also eat the
    # end of worker names such as 'worker@prod'.
    worker_name = reserved_resource.assigned_queue
    if worker_name.endswith(dedicated_queue_suffix):
        worker_name = worker_name[:-len(dedicated_queue_suffix)]
    aqc = Criteria(filters={'_id': worker_name})
    aq_list = list(resources.filter_available_queues(aqc))
    aq_list[0].increment_num_reservations()
    return reserved_resource.assigned_queue
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is parsed and logged first. Events coming from the resource manager need no
    further processing. For any other worker, the AvailableQueue whose _id is the worker name
    is looked up: when it exists its last_heartbeat is set to the event timestamp, otherwise a
    new AvailableQueue is created and saved, and the discovery is logged at the info level.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)

    # Heartbeats from the resource manager are not tracked
    if _is_resource_manager(event):
        return

    worker_name = event_info['worker_name']
    lookup = Criteria(filters={'_id': worker_name},
                      fields=('_id', 'last_heartbeat', 'num_reservations'))
    known_workers = list(resources.filter_available_queues(lookup))

    if not known_workers:
        # First heartbeat seen from this worker: record it
        discovered_queue = AvailableQueue(worker_name, event_info['timestamp'])
        msg = _("New worker '%(worker_name)s' discovered") % event_info
        _logger.info(msg)
        discovered_queue.save()
        return

    # Known worker: only refresh its heartbeat timestamp
    AvailableQueue.get_collection().find_and_modify(
        query={'_id': worker_name},
        update={'$set': {'last_heartbeat': event_info['timestamp']}}
    )
def test_filter(self):
    """
    Run a filter operation against saved queues and check that the results look correct.
    """
    # Save three queues; the filter below should only match the last two.
    timestamp = datetime.utcnow()
    for name, reservations in (('queue_1', 1), ('queue_2', 2), ('queue_3', 3)):
        AvailableQueue(name, timestamp, reservations).save()

    criteria = Criteria(filters={'_id': {'$gt': 'queue_1'}},
                        sort=[('_id', pymongo.ASCENDING)])
    found = resources.filter_available_queues(criteria)

    # The result must be a generator. Turn it into a list afterwards so that the instances
    # it produced are easier to inspect.
    self.assertEqual(type(found), types.GeneratorType)
    found = list(found)
    self.assertEqual(all(isinstance(aq, AvailableQueue) for aq in found), True)
    for position, (name, reservations) in enumerate((('queue_2', 2), ('queue_3', 3))):
        self.assertEqual(found[position].name, name)
        self.assertEqual(found[position].num_reservations, reservations)
def test_criteria_passed_to_mongo(self, get_collection):
    """
    Check that filter_available_queues() hands the Criteria object to the collection's query().
    """
    criteria = Criteria(filters={'_id': 'some_id'})

    # The generator has to be consumed before the call to query() can be checked
    results = list(resources.filter_available_queues(criteria))

    collection = get_collection.return_value
    collection.query.assert_called_once_with(criteria)
    self.assertEqual(results, [])
def check_workers(self):
    """
    Look for missing workers, and dispatch a cleanup task if one goes missing.

    A worker counts as missing when the last_heartbeat of its AvailableQueue is older than
    utcnow() - WORKER_TIMEOUT_SECONDS. For every such worker an error is logged and a
    _delete_queue task is dispatched to the RESOURCE_MANAGER_QUEUE with the worker name.

    This method logs at the debug and error levels.
    """
    timeout = self.WORKER_TIMEOUT_SECONDS
    _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout)

    # Any heartbeat older than this cutoff means the worker is considered gone
    cutoff = datetime.utcnow() - timedelta(seconds=timeout)
    stale_criteria = Criteria(filters={'last_heartbeat': {'$lt': cutoff}},
                              fields=('_id', 'last_heartbeat', 'num_reservations'))
    stale_workers = list(resources.filter_available_queues(stale_criteria))

    for stale_worker in stale_workers:
        _logger.error(
            _("Workers '%s' has gone missing, removing from list of workers") %
            stale_worker.name)
        _delete_queue.apply_async(args=(stale_worker.name,), queue=RESOURCE_MANAGER_QUEUE)
def _delete_queue(queue, normal_shutdown=False):
    """
    Delete the AvailableQueue with _id queue from the database. This Task can only safely be
    performed by the resource manager at this time, so be sure to queue it in the
    RESOURCE_MANAGER_QUEUE.

    Incomplete tasks assigned to the queue are canceled before the queue is deleted. An error
    level message is logged only when normal_shutdown is False. Nothing happens when no
    AvailableQueue matches the given name.

    :param queue:           The name of the queue you wish to delete. In the database, the _id
                            field is the name.
    :type  queue:           basestring
    :param normal_shutdown: True if the worker associated with the queue shutdown normally,
                            False otherwise. Defaults to False.
    :type  normal_shutdown: bool
    """
    matches = list(resources.filter_available_queues(Criteria(filters={'_id': queue})))
    if not matches:
        # Potentially _delete_queue() may be called with the database not containing any entries.
        # https://bugzilla.redhat.com/show_bug.cgi?id=1091922
        return
    available_queue = matches[0]

    if normal_shutdown is False:
        details = {'name': available_queue.name}
        msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.') % details
        logger.error(msg)

    # Every task still in an incomplete state on this queue gets canceled
    incomplete_tasks = Criteria(filters={'queue': available_queue.name,
                                         'state': {'$in': constants.CALL_INCOMPLETE_STATES}})
    for task in TaskStatusManager.find_by_criteria(incomplete_tasks):
        cancel(task['task_id'])

    # With its tasks canceled, the queue itself can go
    available_queue.delete()
def _delete_queue(queue):
    """
    Delete the AvailableQueue with _id queue from the database. This Task can only safely be
    performed by the resource manager at this time, so be sure to queue it in the
    RESOURCE_MANAGER_QUEUE.

    An error is logged, the incomplete tasks assigned to the queue are canceled, and then the
    queue is deleted. Nothing happens when no AvailableQueue matches the given name.

    :param queue: The name of the queue you wish to delete. In the database, the _id field is the
                  name.
    :type  queue: basestring
    """
    queue_list = list(resources.filter_available_queues(Criteria(filters={'_id': queue})))
    if not queue_list:
        # The queue may already be gone from the database. Indexing the empty list would raise
        # an IndexError, so return early instead.
        return
    queue = queue_list[0]

    # Cancel all of the tasks that were assigned to this queue
    msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
    msg = msg % {'name': queue.name}
    logger.error(msg)
    for task in TaskStatusManager.find_by_criteria(
            Criteria(
                filters={'queue': queue.name,
                         'state': {'$in': dispatch_constants.CALL_INCOMPLETE_STATES}})):
        cancel(task['task_id'])

    # Finally, delete the queue
    queue.delete()