def _delete_worker(name, normal_shutdown=False):
    """
    Remove the named worker together with everything attached to it.

    The Worker document, all of the worker's resource reservations and its working directory
    are deleted, and every task of the worker whose state is in
    constants.CALL_INCOMPLETE_STATES is canceled.

    When normal_shutdown is False (the default) the worker is treated as missing and an
    error level message is logged before the cleanup starts. Otherwise nothing is logged.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shut down normally, False otherwise.
                            Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(template % {'name': name})

    # Drop the worker document itself
    Worker.objects(name=name).delete()

    # Drop every reservation held by this worker
    reservations = ReservedResource.get_collection()
    reservations.remove({'worker_name': name})

    # Cancel the incomplete tasks that were assigned to this worker
    incomplete_tasks = TaskStatus.objects(worker_name=name,
                                          state__in=constants.CALL_INCOMPLETE_STATES)
    for incomplete_task in incomplete_tasks:
        cancel(incomplete_task['task_id'])

    # Finally remove the worker's working directory
    common_utils.delete_worker_working_directory(name)
def test_resource_not_in_resource_map(self):
    """
    _release_resource() called with a resource id that has no reservation must be a quiet
    no-op: no exception is raised and neither the Worker collection nor the
    ReservedResource collection is modified.
    """
    # Two workers ...
    first_worker = Worker(WORKER_1, datetime.utcnow())
    first_worker.save()
    second_worker = Worker(WORKER_2, datetime.utcnow())
    second_worker.save()
    # ... each holding one reservation
    first_reservation = ReservedResource(uuid.uuid4(), first_worker.name, 'resource_1')
    first_reservation.save()
    second_reservation = ReservedResource(uuid.uuid4(), second_worker.name, 'resource_2')
    second_reservation.save()

    # Releasing a resource nobody reserved must not raise
    tasks._release_resource('made_up_resource_id')

    # Both workers are still there
    self.assertEqual(Worker.objects().count(), 2)
    for saved_worker in (first_worker, second_worker):
        self.assertTrue(Worker.objects().get(name=saved_worker.name))

    # Both reservations are still there, stored under their task_id, with unchanged content
    reservations = ReservedResource.get_collection()
    self.assertEqual(reservations.count(), 2)
    expectations = (
        (first_reservation, 'resource_1'),
        (second_reservation, 'resource_2'),
    )
    for reservation, expected_resource_id in expectations:
        stored = reservations.find_one({'_id': reservation.task_id})
        self.assertEqual(stored['worker_name'], reservation.worker_name)
        self.assertEqual(stored['resource_id'], expected_resource_id)
def check_workers(self):
    """
    Look for workers whose heartbeat has gone stale and clean each of them up.

    A worker is considered missing when its last_heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The cutoff is computed with datetime.utcnow(), so
    the stored heartbeat times are assumed to be UTC as well.

    For each missing worker an error is logged and _delete_worker() is called with the
    worker's name so that the Worker and any associated work are cleaned up.

    This method logs at the debug and error levels.
    """
    timeout_seconds = self.WORKER_TIMEOUT_SECONDS
    _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout_seconds)

    # Any heartbeat earlier than this moment is too old
    cutoff = datetime.utcnow() - timedelta(seconds=timeout_seconds)

    for missing_worker in Worker.objects(last_heartbeat__lt=cutoff):
        template = _("Workers '%s' has gone missing, removing from list of workers")
        _logger.error(template % missing_worker.name)
        _delete_worker(missing_worker.name)
def _get_unreserved_worker():
    """
    Pick a Worker that currently holds no reserved_resource entries.

    Only workers whose name is accepted by _is_worker are considered. Which of the
    eligible workers is returned is arbitrary.

    :raises NoWorkers: If no eligible worker is free of reserved_resource entries.
    :returns:          A Worker instance that has no reserved_resource entries associated
                       with it.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Index every known Worker by its name
    workers_by_name = dict((worker['name'], worker) for worker in Worker.objects())

    # Names of the workers that hold at least one reservation
    reserved = set(
        reservation['worker_name'] for reservation in ReservedResource.get_collection().find())

    # Eligible names minus reserved names leaves the candidates.
    # NB: no set comprehensions here, they require python 2.7+
    candidates = set(filter(_is_worker, workers_by_name.keys())) - reserved

    try:
        chosen_name = candidates.pop()
        return workers_by_name[chosen_name]
    except KeyError:
        # pop() on an empty set: every worker is reserved
        raise NoWorkers()
def _get_unreserved_worker():
    """
    Return a Worker that has no reserved_resource entries associated with it.

    Worker names rejected by _is_worker are never returned. When several workers qualify,
    an arbitrary one of them is returned.

    :raises NoWorkers: If there is no qualifying worker without reserved_resource entries.
    :returns:          A Worker instance with no reserved_resource entries.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Map of worker name -> Worker object
    name_to_worker = dict((w['name'], w) for w in Worker.objects())
    all_names = name_to_worker.keys()

    # Every worker name that shows up in a reservation
    busy_names = [doc['worker_name'] for doc in ReservedResource.get_collection().find()]

    # Keep only the names that _is_worker accepts, then take away the busy ones
    assignable_names = [worker_name for worker_name in all_names if _is_worker(worker_name)]
    free_names = set(assignable_names).difference(busy_names)

    try:
        return name_to_worker[free_names.pop()]
    except KeyError:
        # Nothing left to pop, so all workers are reserved
        raise NoWorkers()
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is parsed and logged by _parse_and_log_event(). If no Worker with the
    reported name exists yet, the discovery is logged at the info level. In either case the
    Worker's last_heartbeat is then set to the event's timestamp with an upsert, which
    creates the Worker entry when it is missing.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)
    worker_name = event_info['worker_name']

    known_worker = Worker.objects(name=worker_name).first()
    if not known_worker:
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)

    # upsert=True covers both cases: update an existing Worker or create a new one
    Worker.objects(name=worker_name).update_one(
        set__last_heartbeat=event_info['timestamp'], upsert=True)
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker called name and clean up everything that belonged to it.

    Cleanup covers the Worker document, the worker's reserved_resource documents, the tasks
    of the worker that are in one of constants.CALL_INCOMPLETE_STATES (they are canceled),
    and the worker's working directory.

    An error level message is logged first when normal_shutdown is False, which is the
    default. No message is logged for a worker that shut down normally.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shut down normally, False otherwise.
                            Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.') %
            {'name': name})

    # The worker document goes first
    doomed_worker = Worker.objects(name=name)
    doomed_worker.delete()

    # Then all reserved_resource documents of that worker
    worker_filter = {'worker_name': name}
    ReservedResource.get_collection().remove(worker_filter)

    # Then cancel each task of the worker that has not completed
    unfinished = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        task_id = status['task_id']
        cancel(task_id)

    # And last, the working directory
    common_utils.delete_worker_working_directory(name)
def get_worker_for_reservation(resource_id):
    """
    Return the Worker that holds a reservation for resource_id.

    The first reserved_resource document found for resource_id determines the worker name,
    and the first Worker with that name is returned.

    :param resource_id: The name of the resource you wish to reserve for your task.
    :type  resource_id: basestring
    :raises NoWorkers:  If no reserved_resource entry exists for resource_id.
    :returns:           The Worker instance that has a reserved_resource entry of type
                        `resource_id` associated with it.
    :rtype:             pulp.server.db.model.resources.Worker
    """
    reservation = ReservedResource.get_collection().find_one({'resource_id': resource_id})
    if not reservation:
        raise NoWorkers()

    # NOTE(review): the result of first() is returned as-is; presumably that is None if the
    # reservation names a worker that no longer exists - confirm
    return Worker.objects(name=reservation['worker_name']).first()
def get_worker_for_reservation(resource_id):
    """
    Look up the Worker associated with an existing reservation of resource_id.

    :param resource_id: The name of the resource you wish to reserve for your task.
    :type  resource_id: basestring
    :raises NoWorkers:  If there is no reserved_resource entry for resource_id.
    :returns:           The Worker instance named by the reserved_resource entry found for
                        `resource_id`.
    :rtype:             pulp.server.db.model.resources.Worker
    """
    reserved_resources = ReservedResource.get_collection()
    matching_entry = reserved_resources.find_one({'resource_id': resource_id})

    # Without a reservation there is no worker to hand back
    if not matching_entry:
        raise NoWorkers()

    owner_name = matching_entry['worker_name']
    return Worker.objects(name=owner_name).first()
def check_workers(self):
    """
    Detect missing workers and delete them.

    Workers whose last_heartbeat is earlier than utcnow() - WORKER_TIMEOUT_SECONDS are
    treated as missing. The comparison relies on the heartbeat times being stored in UTC
    (assumed here, since the threshold comes from datetime.utcnow()).

    Every missing worker is reported at the error level and then handed to
    _delete_worker(), which removes the Worker and cleans up its associated work.

    This method logs at the debug and error levels.
    """
    search_msg = _('Looking for workers missing for more than %s seconds')
    _logger.debug(search_msg % self.WORKER_TIMEOUT_SECONDS)

    # Heartbeats before this threshold mean the worker has been silent for too long
    allowed_silence = timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    threshold = datetime.utcnow() - allowed_silence

    stale_workers = Worker.objects(last_heartbeat__lt=threshold)
    for stale_worker in stale_workers:
        _logger.error(
            _("Workers '%s' has gone missing, removing from list of workers") %
            stale_worker.name)
        _delete_worker(stale_worker.name)
def tearDown(self):
    """
    Empty the Worker, ReservedResource and TaskStatus collections after each test.
    """
    all_workers = Worker.objects()
    all_workers.delete()

    reservations = ReservedResource.get_collection()
    reservations.remove()

    all_task_statuses = TaskStatus.objects()
    all_task_statuses.delete()
def get_workers():
    """
    Return all known workers.

    :returns: the workers with their heartbeats, exactly as produced by Worker.objects()
    :rtype:   list-like iterable (presumably a mongoengine QuerySet rather than a plain
              list - confirm)
    """
    all_workers = Worker.objects()
    return all_workers