def handle_worker_heartbeat(worker_name):
    """
    Update the heartbeat record for a worker, creating it if necessary.

    Existing Worker objects are searched for one to update. If an existing one
    is found, it is updated. Otherwise a new Worker entry is created (via
    upsert). New workers are logged at the info level; a warning is emitted
    when writing the heartbeat took longer than one heartbeat interval.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    start = datetime.utcnow()
    existing_worker = Worker.objects(name=worker_name).first()

    if not existing_worker:
        msg = _("New worker '%s' discovered") % worker_name
        _logger.info(msg)

    timestamp = datetime.utcnow()
    msg = _("Worker heartbeat from '{name}' at time {timestamp}").format(timestamp=timestamp,
                                                                         name=worker_name)
    _logger.debug(msg)
    Worker.objects(name=worker_name).update_one(set__last_heartbeat=timestamp, upsert=True)

    # Measure the elapsed time once so the comparison and the logged value
    # agree (the original sampled utcnow() twice).
    elapsed = datetime.utcnow() - start
    if elapsed > timedelta(seconds=PULP_PROCESS_HEARTBEAT_INTERVAL):
        msg = _("Worker {name} heartbeat time {time}s exceeds heartbeat interval. Consider "
                "adjusting the worker_timeout setting.").format(time=elapsed.total_seconds(),
                                                                name=worker_name)
        # logging.warn() is a deprecated alias; use warning().
        _logger.warning(msg)
def _delete_worker(name, normal_shutdown=False):
    """
    Remove the named Worker from the database and clean up after it.

    Cancels every incomplete task that was assigned to the worker's queue,
    removes its resource reservations, and deletes its working directory.
    An error is logged unless the worker is known to have shut down normally.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False
                            otherwise. Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        # An unexpected disappearance is an error worth surfacing in the logs.
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
            % {'name': name})

    # Remove the worker record itself.
    Worker.objects(name=name).delete()

    # Drop every resource reservation that was held by this worker.
    ReservedResource.objects(worker_name=name).delete()

    # Explicitly cancel each incomplete task assigned to the worker's queue.
    unfinished = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        cancel(status['task_id'])

    # Finally clean the worker's working directory off disk.
    common_utils.delete_worker_working_directory(name)
def test_resource_not_in_resource_map(self):
    """
    Test _release_resource() with a resource that is not in the database. This
    should be gracefully handled, and result in no changes to the database.
    """
    # Create a pair of workers
    first_worker = Worker(WORKER_1, datetime.utcnow())
    first_worker.save()
    second_worker = Worker(WORKER_2, datetime.utcnow())
    second_worker.save()

    # Give each worker one resource reservation
    rr_one = ReservedResource(str(uuid.uuid4()), first_worker.name, 'resource_1')
    rr_one.save()
    rr_two = ReservedResource(str(uuid.uuid4()), second_worker.name, 'resource_2')
    rr_two.save()

    # Releasing an unknown resource must neither raise nor mutate the
    # Worker or ReservedResource collections.
    tasks._release_resource('made_up_resource_id')

    # The workers collection is untouched
    self.assertEqual(Worker.objects().count(), 2)
    self.assertTrue(Worker.objects().get(name=first_worker.name))
    self.assertTrue(Worker.objects().get(name=second_worker.name))

    # The reserved resources collection is untouched
    self.assertEqual(ReservedResource.objects.count(), 2)
    stored_one = ReservedResource.objects.get(task_id=rr_one.task_id)
    self.assertEqual(stored_one['worker_name'], rr_one.worker_name)
    self.assertEqual(stored_one['resource_id'], 'resource_1')
    stored_two = ReservedResource.objects.get(task_id=rr_two.task_id)
    self.assertEqual(stored_two['worker_name'], rr_two.worker_name)
    self.assertEqual(stored_two['resource_id'], 'resource_2')
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker named ``name`` and cancel its outstanding work.

    Removes the Worker document, drops every resource reservation held by it,
    and explicitly cancels each incomplete task in its queue. An error is
    logged unless the shutdown is known to have been a normal one.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False
                            otherwise. Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        # Unexpected disappearance: log loudly before cleaning up.
        missing_msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(missing_msg % {'name': name})

    # Remove the worker record.
    Worker.objects(name=name).delete()

    # Remove every reservation held on the worker's behalf.
    ReservedResource.objects(worker_name=name).delete()

    # Cancel each task that was queued on this worker and never finished.
    incomplete = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for task in incomplete:
        cancel(task['task_id'])
def get_resource_manager_lock(name):
    """
    Block until the resource manager lock has been acquired.

    If the lock is held by another instance, keep waiting: stale locks whose
    timestamps have not been refreshed within the process timeout window are
    deleted, letting us acquire the lock once the active instance goes silent.
    While waiting, a worker record is kept up to date so this instance shows
    up in the Status API; that record is removed by the normal worker
    shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)
    lock = ResourceManagerLock(name=name)

    # True until the hot-spare notice has been logged once.
    first_attempt = True

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        stale_cutoff = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)

        # Purge locks whose owner has not refreshed them recently enough.
        ResourceManagerLock.objects(timestamp__lte=stale_cutoff).delete()

        # Keep our worker record fresh so Pulp knows this instance exists.
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(),
                                             upsert=True)

        try:
            lock.timestamp = now
            lock.save()

            msg = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.debug(msg)

            if not first_attempt:
                # We previously waited as a hot spare, so this is a failover.
                msg = _("Failover occurred: '%s' is now the primary resource manager") % name
                _logger.warning(msg)
            break
        except mongoengine.NotUniqueError:
            # Another instance holds the lock; announce hot-spare status once.
            if first_attempt:
                _logger.info(_("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                             % {'name': name})
                first_attempt = False

            time.sleep(constants.PULP_PROCESS_HEARTBEAT_INTERVAL)
def get_resource_manager_lock(name):
    """
    Block until this instance holds the resource manager lock.

    If another instance holds the lock, wait for it to go stale: locks whose
    timestamps are older than the process timeout interval are deleted, which
    lets us acquire the lock once the currently active instance becomes
    unavailable. A worker record is maintained while waiting so this instance
    appears in the Status API; the regular worker shutdown routine cleans
    that record up.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)
    lock = ResourceManagerLock(name=name)

    # True until we have logged the hot-spare notice once.
    announce_spare = True

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        cutoff = now - timedelta(seconds=PULP_PROCESS_TIMEOUT_INTERVAL)

        # Drop locks that have not been refreshed within the timeout window.
        ResourceManagerLock.objects(timestamp__lte=cutoff).delete()

        # Refresh our worker record so Pulp knows this instance exists.
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(),
                                             upsert=True)

        try:
            lock.timestamp = now
            lock.save()

            _logger.debug(_("Resource manager '%s' has acquired the resource manager lock")
                          % name)

            if not announce_spare:
                # We waited as a spare at least once, so this is a failover.
                _logger.warning(_("Failover occurred: '%s' is now the primary resource manager")
                                % name)
            break
        except mongoengine.NotUniqueError:
            # The lock is taken; log the hot-spare notice only the first time.
            if announce_spare:
                _logger.info(_("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                             % {'name': name})
                announce_spare = False

            time.sleep(PULP_PROCESS_HEARTBEAT_INTERVAL)
def _get_unreserved_worker():
    """
    Return a Worker that currently holds no resource reservations.

    :raises NoWorkers: If all workers have reserved_resource entries associated
                       with them.
    :returns: The Worker instance that has no reserved_resource entries
              associated with it.
    :rtype:   pulp.server.db.model.resources.Worker
    """
    # Map worker name -> Worker document for every known worker.
    workers_by_name = {worker['name']: worker for worker in Worker.objects()}
    busy_names = {r['worker_name'] for r in ReservedResource.objects.all()}

    # Keep only names that may be assigned work, then subtract the busy ones.
    idle_names = {n for n in workers_by_name if _is_worker(n)} - busy_names

    try:
        # set.pop() raises KeyError when idle_names is empty.
        return workers_by_name[idle_names.pop()]
    except KeyError:
        # Every assignable worker is currently reserved.
        raise NoWorkers()
def _get_unreserved_worker():
    """
    Find a Worker with no reserved_resource entries associated with it.

    :raises NoWorkers: If all workers have reserved_resource entries associated
                       with them.
    :returns: The Worker instance that has no reserved_resource entries
              associated with it.
    :rtype:   pulp.server.db.model.resources.Worker
    """
    # Index all known workers by name.
    all_workers = dict((w['name'], w) for w in Worker.objects())

    # Names of workers that already hold at least one reservation.
    reserved = set(rr['worker_name'] for rr in ReservedResource.objects.all())

    # Assignable workers are those passing _is_worker; remove the reserved
    # ones via set difference.
    candidates = set(name for name in all_workers.keys() if _is_worker(name))
    free = candidates - reserved

    try:
        # An empty 'free' set makes pop() raise KeyError.
        return all_workers[free.pop()]
    except KeyError:
        # All workers are reserved.
        raise NoWorkers()
def check_workers(self):
    """
    Look for missing workers, and dispatch a cleanup task if one goes missing.

    A worker is considered missing when its last heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS; heartbeats are stored as naive UTC, so
    the datetimes compare directly. For every missing worker a _delete_worker
    call is made so the Worker record and any associated work get cleaned up.

    This method logs at the debug and error levels.
    """
    _logger.debug(_('Looking for workers missing for more than %s seconds')
                  % self.WORKER_TIMEOUT_SECONDS)

    cutoff = datetime.utcnow() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    for missing in Worker.objects(last_heartbeat__lt=cutoff):
        _logger.error(_("Workers '%s' has gone missing, removing from list of workers")
                      % missing.name)
        _delete_worker(missing.name)
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker named ``name`` and clean up everything attached to it.

    Removes the Worker document and its resource reservations, releases any
    resource-manager or scheduler lock the worker held, and cancels every
    incomplete task in its queue. An error is logged when the worker vanished
    unexpectedly; a normal shutdown is logged at info level instead.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False
                            otherwise. Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        missing_msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(missing_msg % {'name': name})
    else:
        _logger.info(_("Cleaning up shutdown worker '%s'.") % name)

    # Remove the worker record.
    Worker.objects(name=name).delete()

    # Remove every reservation the worker held.
    ReservedResource.objects(worker_name=name).delete()

    # A resource manager also owns a ResourceManagerLock that must be released.
    if name.startswith(RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).delete()

    # A scheduler likewise owns a CeleryBeatLock.
    if name.startswith(SCHEDULER_WORKER_NAME):
        CeleryBeatLock.objects(name=name).delete()

    # Cancel every incomplete task queued on this worker, passing
    # revoke_task=False.
    unfinished = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        cancel(status['task_id'], revoke_task=False)
def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the
    currently active instance becomes unavailable, at which point the worker
    cleanup routine will clear the lock for us to acquire.

    A worker record will be created so that the waiting resource manager will
    appear in the Status API. We override the SIGTERM signal handler so that
    the worker record will be immediately cleaned up if the process is killed
    while in this state.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)
    lock = ResourceManagerLock(name=name)

    with custom_sigterm_handler(name):
        # Whether this is the first lock availability check for this instance
        _first_check = True

        while True:
            # Create / update the worker record so that Pulp knows we exist
            Worker.objects(name=name).update_one(
                set__last_heartbeat=datetime.utcnow(), upsert=True)
            try:
                lock.save()

                # Interpolate *after* the gettext lookup so translation
                # catalogs can match the message template (the original
                # interpolated first, defeating translation).
                msg = _("Resource manager '%s' has acquired the resource manager lock") % name
                _logger.info(msg)
                break
            except mongoengine.NotUniqueError:
                # Only log the message the first time
                if _first_check:
                    # Also fixes the doubled "the the" in the retry message.
                    msg = _("Resource manager '%s' attempted to acquire the resource manager "
                            "lock but was unable to do so. It will retry every %d seconds until "
                            "the lock can be acquired.") % (name, constants.CELERY_CHECK_INTERVAL)
                    _logger.info(msg)
                    _first_check = False

                time.sleep(constants.CELERY_CHECK_INTERVAL)
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker with _id ``name`` from the database, cancel any
    associated tasks and reservations.

    If the worker shut down normally an info message is logged; otherwise an
    error level message is logged. Locks held by resource-manager or scheduler
    workers are also released.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False
                            otherwise. Defaults to False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
            % {'name': name})
    else:
        _logger.info(_("Cleaning up shutdown worker '%s'.") % name)

    # Drop the worker document and all of its reservations.
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Release the special locks held by resource-manager / scheduler workers.
    if name.startswith(RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).delete()
    if name.startswith(SCHEDULER_WORKER_NAME):
        CeleryBeatLock.objects(name=name).delete()

    # Cancel every task still pending on this worker's queue (revoke_task is
    # passed as False).
    pending = TaskStatus.objects(worker_name=name,
                                 state__in=constants.CALL_INCOMPLETE_STATES)
    for status in pending:
        cancel(status['task_id'], revoke_task=False)
def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the
    currently active instance becomes unavailable, at which point the worker
    cleanup routine will clear the lock for us to acquire. A worker record
    will be created so that the waiting resource manager will appear in the
    Status API. This worker record will be cleaned up through the regular
    worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)
    lock = ResourceManagerLock(name=name)

    # Whether this is the first lock availability check for this instance
    _first_check = True

    while True:
        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(),
                                             upsert=True)
        try:
            lock.save()

            msg = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.info(msg)
            break
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if _first_check:
                # Fixed the doubled "the the" in the retry message.
                msg = _("Resource manager '%(name)s' attempted to acquire the resource manager "
                        "lock but was unable to do so. It will retry every %(interval)d seconds "
                        "until the lock can be acquired.") % \
                    {'name': name, 'interval': constants.CELERY_CHECK_INTERVAL}
                _logger.info(msg)
                _first_check = False

            time.sleep(constants.CELERY_CHECK_INTERVAL)
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is first parsed and logged. If no Worker document exists yet for
    the sending worker, its discovery is logged at the info level. The
    worker's last_heartbeat is then upserted from the event's local_received
    time.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)

    # A first heartbeat from a previously unknown worker is worth an info line.
    if not Worker.objects(name=event_info['worker_name']).first():
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)

    Worker.objects(name=event_info['worker_name']).update_one(
        set__last_heartbeat=event_info['local_received'], upsert=True)
def test_resource_not_in_resource_map(self):
    """
    Test _release_resource() with a resource that is not in the database. This
    should be gracefully handled, and result in no changes to the database.
    """
    # Create two workers
    worker_a = Worker(name=WORKER_1, last_heartbeat=datetime.utcnow())
    worker_a.save()
    worker_b = Worker(name=WORKER_2, last_heartbeat=datetime.utcnow())
    worker_b.save()

    # One reservation per worker
    reservation_a = ReservedResource(task_id=str(uuid.uuid4()),
                                     worker_name=worker_a.name,
                                     resource_id='resource_1')
    reservation_a.save()
    reservation_b = ReservedResource(task_id=str(uuid.uuid4()),
                                     worker_name=worker_b.name,
                                     resource_id='resource_2')
    reservation_b.save()

    # Releasing an unknown resource id must not raise and must not modify
    # the Worker or ReservedResource collections.
    tasks._release_resource('made_up_resource_id')

    # Workers are unchanged
    self.assertEqual(Worker.objects().count(), 2)
    self.assertTrue(Worker.objects().get(name=worker_a.name))
    self.assertTrue(Worker.objects().get(name=worker_b.name))

    # Reservations are unchanged
    self.assertEqual(ReservedResource.objects.count(), 2)
    stored_a = ReservedResource.objects.get(task_id=reservation_a.task_id)
    self.assertEqual(stored_a['worker_name'], reservation_a.worker_name)
    self.assertEqual(stored_a['resource_id'], 'resource_1')
    stored_b = ReservedResource.objects.get(task_id=reservation_b.task_id)
    self.assertEqual(stored_b['worker_name'], reservation_b.worker_name)
    self.assertEqual(stored_b['resource_id'], 'resource_2')
def handle_worker_heartbeat(worker_name):
    """
    Record a heartbeat for the named worker, creating its record if needed.

    Existing Worker objects are searched for one to update; when none is found
    the discovery of a new worker is logged at the info level. The heartbeat
    itself is logged at the debug level and written with an upsert.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    if not Worker.objects(name=worker_name).first():
        _logger.info(_("New worker '%s' discovered") % worker_name)

    timestamp = datetime.utcnow()
    _logger.debug(_("Worker heartbeat from '{name}' at time {timestamp}").format(
        timestamp=timestamp, name=worker_name))
    Worker.objects(name=worker_name).update_one(set__last_heartbeat=timestamp, upsert=True)
def get_worker_for_reservation(resource_id):
    """
    Return the Worker instance that is associated with a reservation of type
    resource_id.

    :param resource_id: The name of the resource you wish to reserve for your
                        task.
    :type  resource_id: basestring
    :raises NoWorkers:  If no worker holds a reservation for `resource_id`.
    :returns: The Worker instance that has a reserved_resource entry of type
              `resource_id` associated with it.
    :rtype:   pulp.server.db.model.resources.Worker
    """
    reservation = ReservedResource.objects(resource_id=resource_id).first()

    # Guard clause: no reservation of this type means no worker to return.
    if not reservation:
        raise NoWorkers()

    return Worker.objects(name=reservation['worker_name']).first()
def get_worker_for_reservation_list(resources):
    """
    Return the Worker instance that is associated with the reservations
    described by the 'resources' list.

    This will be either an existing Worker that is dealing with at least one
    of the specified resources, or an available idle Worker. We sleep and
    retry the request until it can be fulfilled.

    :param resources: A list of the names of the resources you wish to reserve
                      for your task.
    :type  resources: list
    :returns: The Worker instance that has a reserved_resource entry associated
              with it for each resource in 'resources'
    :rtype:   pulp.server.db.model.resources.Worker
    """
    # Lazy %-style logger arguments: the messages are only formatted when the
    # debug level is actually enabled (the original formatted eagerly).
    _logger.debug('get_worker_for_reservation_list [%s]', resources)

    # We leave this loop once we find a Worker to return - otherwise, sleep and try again
    while True:
        holders = set(reservation['worker_name']
                      for reservation in ReservedResource.objects(resource_id__in=resources))
        _logger.debug('...num-RR is %d', len(holders))

        if len(holders) == 1:
            # Exactly one worker holds any of the desired resources
            _logger.debug('...one-holds')
            return Worker.objects(name=list(holders)[0]).first()
        elif len(holders) == 0:
            # No worker holds any of the desired resources; try an idle one
            _logger.debug('...zero-holds')
            try:
                return _get_unreserved_worker()
            except NoWorkers:
                # No idle worker either; fall through to the sleep and retry.
                _logger.debug('...unresolved NoWorkers - WAIT')
        else:
            # Multiple workers hold pieces; wait for the contention to resolve
            _logger.debug('...multiple-holds - WAIT')
        time.sleep(0.25)
def check_workers(self):
    """
    Look for missing workers, and dispatch a cleanup task if one goes missing.

    A worker counts as missing once its last heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS; heartbeats are stored in naive UTC, so
    the two datetimes compare directly. Each missing worker is handed to
    _delete_worker so its record and associated work are cleaned up.

    This method logs at the debug and error levels.
    """
    threshold_msg = _('Looking for workers missing for more than %s seconds')
    _logger.debug(threshold_msg % self.WORKER_TIMEOUT_SECONDS)

    timeout = timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    oldest_allowed = datetime.utcnow() - timeout

    stale_workers = Worker.objects(last_heartbeat__lt=oldest_allowed)
    for stale in stale_workers:
        gone_msg = _("Workers '%s' has gone missing, removing from list of workers")
        _logger.error(gone_msg % stale.name)
        _delete_worker(stale.name)
def tearDown(self):
    """Clear out every document the tests may have created."""
    TaskStatus.objects().delete()
    ReservedResource.objects.delete()
    Worker.objects().delete()
def get_workers():
    """
    Fetch every known worker record.

    :returns: list of workers with their heartbeats
    :rtype:   list
    """
    # Return the full Worker queryset; each document carries last_heartbeat.
    return Worker.objects()