def check_workers(self):
    """
    Find workers whose heartbeat is stale and queue a cleanup task for each of them.

    A worker counts as missing when its last_heartbeat is earlier than
    utcnow() - WORKER_TIMEOUT_SECONDS. Heartbeat times are stored in UTC, so the cutoff derived
    from utcnow() can be compared against them directly. Every missing worker is reported at the
    error level, and a _delete_worker task is sent to the RESOURCE_MANAGER_QUEUE so that the
    resource manager deletes the Worker and cleans up any associated work.

    This method logs at the debug and error levels.
    """
    timeout_seconds = self.WORKER_TIMEOUT_SECONDS
    _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout_seconds)

    # Anything that last reported in before this moment is considered gone.
    cutoff = datetime.utcnow() - timedelta(seconds=timeout_seconds)
    stale_heartbeat = {'last_heartbeat': {'$lt': cutoff}}
    criteria = Criteria(filters=stale_heartbeat,
                        fields=('_id', 'last_heartbeat', 'num_reservations'))

    # Collect the full result set first, then dispatch one cleanup task per missing worker.
    missing_workers = list(resources.filter_workers(criteria))
    for missing in missing_workers:
        missing_name = missing.name
        _logger.error(
            _("Workers '%s' has gone missing, removing from list of workers") % missing_name)
        _delete_worker.apply_async(args=(missing_name,), queue=RESOURCE_MANAGER_QUEUE)
def test_filter(self):
    """
    Run filter_workers() with a real filter and check which workers come back.
    """
    # Save three workers that share one heartbeat time; the filter below should match two.
    heartbeat = datetime.utcnow()
    for worker_name in ('worker_1', 'worker_2', 'worker_3'):
        Worker(worker_name, heartbeat).save()

    after_first_worker = {'_id': {'$gt': 'worker_1'}}
    criteria = Criteria(filters=after_first_worker, sort=[('_id', pymongo.ASCENDING)])

    result = resources.filter_workers(criteria)

    # The call must hand back a generator. Once that is confirmed, turn it into a list so the
    # individual instances can be inspected by position.
    self.assertEqual(type(result), types.GeneratorType)
    found = list(result)
    self.assertTrue(all(isinstance(item, Worker) for item in found))
    self.assertEqual(found[0].name, 'worker_2')
    self.assertEqual(found[1].name, 'worker_3')
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is parsed and logged first. The stored Worker objects are then searched for the
    worker named in the event: when a record exists its last_heartbeat is updated, otherwise a
    new Worker entry is created and saved.

    Logging at the info and debug level is also done.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)
    worker_name = event_info['worker_name']

    lookup = Criteria(filters={'_id': worker_name}, fields=('_id', 'last_heartbeat'))
    known_workers = list(resources.filter_workers(lookup))

    if not known_workers:
        # First heartbeat seen from this worker: announce it and store a new record.
        new_worker = Worker(worker_name, event_info['timestamp'])
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)
        new_worker.save()
        return

    # The worker is already recorded, so only its heartbeat time needs refreshing.
    Worker.get_collection().find_and_modify(
        query={'_id': worker_name},
        update={'$set': {'last_heartbeat': event_info['timestamp']}})
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is parsed and logged first. Events coming from the resource manager need no
    further processing and are dropped at that point. For every other event the stored Worker
    objects are searched for the worker named in the event: when a record exists its
    last_heartbeat is updated, otherwise a new Worker entry is created and saved.

    Logging at the info and debug level is also done.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)

    # Heartbeats from the resource manager are not tracked as workers.
    if _is_resource_manager(event):
        return

    worker_name = event_info['worker_name']
    lookup = Criteria(filters={'_id': worker_name},
                      fields=('_id', 'last_heartbeat', 'num_reservations'))
    known_workers = list(resources.filter_workers(lookup))

    if not known_workers:
        # First heartbeat seen from this worker: announce it and store a new record.
        new_worker = Worker(worker_name, event_info['timestamp'])
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)
        new_worker.save()
        return

    # The worker is already recorded, so only its heartbeat time needs refreshing.
    Worker.get_collection().find_and_modify(
        query={'_id': worker_name},
        update={'$set': {'last_heartbeat': event_info['timestamp']}})
def get_workers():
    """
    Look up every worker by filtering with an empty Criteria.

    The value produced by resources.filter_workers() is returned as-is; it is not converted to
    a list here.

    :returns: the workers with their heartbeats
    :rtype:   iterable
    """
    return resources.filter_workers(Criteria())
def test_criteria_passed_to_mongo(self, get_collection):
    """
    Assert that filter_workers() hands the Criteria object on to the collection's query().

    :param get_collection: stand-in for the collection getter; presumably a mock injected by a
                           patch decorator that is not visible here
    """
    criteria = Criteria(filters={'_id': 'some_id'})

    found = list(resources.filter_workers(criteria))

    collection = get_collection.return_value
    collection.query.assert_called_once_with(criteria)
    self.assertEqual(found, [])
def _delete_worker(name, normal_shutdown=False):
    """
    Remove the Worker whose _id is name, together with its reservations and unfinished tasks.

    Nothing is logged for a normal shutdown; in every other case an error level message is
    logged. The default is to assume that the worker did not shut down normally.

    All resource reservations held for this worker are removed, and every task assigned to the
    worker that is still in an incomplete state is explicitly canceled.

    :param name:            The name of the worker you wish to delete. In the database, the _id
                            field is the name.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        logger.error(template % {'name': name})

    # Remove the worker document itself, if there is one.
    matches = list(resources.filter_workers(Criteria(filters={'_id': name})))
    if matches:
        matches[0].delete()

    # Drop every reserved_resource document that belongs to the worker.
    ReservedResource.get_collection().remove({'worker_name': name})

    # Cancel whatever was assigned to this worker and has not finished yet.
    worker = Worker.from_bson({'_id': name})
    unfinished = Criteria(filters={
        'worker_name': worker.name,
        'state': {'$in': constants.CALL_INCOMPLETE_STATES},
    })
    for task in TaskStatusManager.find_by_criteria(unfinished):
        cancel(task['task_id'])
def _delete_worker(name, normal_shutdown=False):
    """
    Remove the Worker whose _id is name from the database.

    At this time only the resource manager can safely perform this Task, so make sure it is
    queued in the RESOURCE_MANAGER_QUEUE.

    Nothing is logged for a normal shutdown; in every other case an error level message is
    logged. The default is to assume that the worker did not shut down normally.

    :param name:            The name of the worker you wish to delete. In the database, the _id
                            field is the name.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    matches = list(resources.filter_workers(Criteria(filters={'_id': name})))
    if not matches:
        # _delete_worker() may be called when the database holds no entry for this worker.
        # https://bugzilla.redhat.com/show_bug.cgi?id=1091922
        return
    worker = matches[0]

    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        logger.error(template % {'name': worker.name})

    # Cancel every unfinished task that was assigned to this worker's queue.
    unfinished = Criteria(filters={
        'queue': worker.queue_name,
        'state': {'$in': constants.CALL_INCOMPLETE_STATES},
    })
    for task in TaskStatusManager.find_by_criteria(unfinished):
        cancel(task['task_id'])

    # The worker record goes last, after its tasks have been canceled.
    worker.delete()
def test_filter(self):
    """
    Check that a filtering call to filter_workers() yields the expected Worker instances.
    """
    # Three workers are stored with the same heartbeat; only two should match the filter.
    timestamp = datetime.utcnow()
    first, second, third = 'worker_1', 'worker_2', 'worker_3'
    Worker(first, timestamp).save()
    Worker(second, timestamp).save()
    Worker(third, timestamp).save()

    criteria = Criteria(
        filters={'_id': {'$gt': 'worker_1'}},
        sort=[('_id', pymongo.ASCENDING)],
    )
    filtered = resources.filter_workers(criteria)

    # A generator is expected back. It is converted to a list right after that check so the
    # returned instances are easier to examine.
    self.assertEqual(type(filtered), types.GeneratorType)
    matched = list(filtered)
    self.assertTrue(all(isinstance(entry, Worker) for entry in matched))
    self.assertEqual(matched[0].name, 'worker_2')
    self.assertEqual(matched[1].name, 'worker_3')
def check_workers(self):
    """
    Find workers whose heartbeat is stale and clean each of them up.

    A worker counts as missing when its last_heartbeat is earlier than
    utcnow() - WORKER_TIMEOUT_SECONDS. Heartbeat times are stored in UTC, so the cutoff derived
    from utcnow() can be compared against them directly. Every missing worker is reported at the
    error level and then passed, by name, to _delete_worker().

    This method logs at the debug and error levels.
    """
    timeout_seconds = self.WORKER_TIMEOUT_SECONDS
    _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout_seconds)

    # Anything that last reported in before this moment is considered gone.
    cutoff = datetime.utcnow() - timedelta(seconds=timeout_seconds)
    criteria = Criteria(
        filters={'last_heartbeat': {'$lt': cutoff}},
        fields=('_id', 'last_heartbeat'),
    )

    # Collect the full result set first, then remove the missing workers one by one.
    missing_workers = list(resources.filter_workers(criteria))
    for missing in missing_workers:
        missing_name = missing.name
        _logger.error(
            _("Workers '%s' has gone missing, removing from list of workers") % missing_name)
        _delete_worker(missing_name)
def _delete_worker(name, normal_shutdown=False):
    """
    Remove the Worker whose _id is name, together with its reservations and unfinished tasks.

    Nothing is logged for a normal shutdown; in every other case an error level message is
    logged. The default is to assume that the worker did not shut down normally.

    All resource reservations held for this worker are removed, and every task assigned to the
    worker that is still in an incomplete state is explicitly canceled.

    :param name:            The name of the worker you wish to delete. In the database, the _id
                            field is the name.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(template % {'name': name})

    # Remove the worker document itself, if there is one.
    matches = list(resources.filter_workers(Criteria(filters={'_id': name})))
    if matches:
        matches[0].delete()

    # Drop every reserved_resource document that belongs to the worker.
    ReservedResource.get_collection().remove({'worker_name': name})

    # Cancel whatever was assigned to this worker and has not finished yet.
    worker = Worker.from_bson({'_id': name})
    unfinished_tasks = TaskStatus.objects(worker_name=worker.name,
                                          state__in=constants.CALL_INCOMPLETE_STATES)
    for task_status in unfinished_tasks:
        cancel(task_status['task_id'])
def _delete_worker(name, normal_shutdown=False):
    """
    Remove the Worker whose _id is name from the database.

    At this time only the resource manager can safely perform this Task, so make sure it is
    queued in the RESOURCE_MANAGER_QUEUE.

    Nothing is logged for a normal shutdown; in every other case an error level message is
    logged. The default is to assume that the worker did not shut down normally.

    :param name:            The name of the worker you wish to delete. In the database, the _id
                            field is the name.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    by_name = Criteria(filters={'_id': name})
    candidates = list(resources.filter_workers(by_name))
    if not candidates:
        # _delete_worker() may be called when the database holds no entry for this worker.
        # https://bugzilla.redhat.com/show_bug.cgi?id=1091922
        return
    worker = candidates[0]

    if normal_shutdown is False:
        text = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        logger.error(text % {'name': worker.name})

    # Cancel every unfinished task that was assigned to this worker's queue.
    incomplete_filters = {
        'queue': worker.queue_name,
        'state': {'$in': constants.CALL_INCOMPLETE_STATES},
    }
    for task in TaskStatusManager.find_by_criteria(Criteria(filters=incomplete_filters)):
        cancel(task['task_id'])

    # The worker record goes last, after its tasks have been canceled.
    worker.delete()