def _delete_worker(name, normal_shutdown=False):
    """
    Remove a worker from the database together with its reservations and pending tasks.

    The Worker document called ``name`` is deleted first, then every ReservedResource document
    recorded for that worker. Finally each TaskStatus that belongs to the worker and whose state
    is listed in constants.CALL_INCOMPLETE_STATES is canceled.

    When normal_shutdown is False (the default) an error level message is logged before anything
    is removed; for any other value nothing is logged.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(template % {'name': name})

    # The worker document goes first, followed by the reservations it was holding
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Whatever is still incomplete on this worker's queue gets canceled
    incomplete_tasks = TaskStatus.objects(worker_name=name,
                                          state__in=constants.CALL_INCOMPLETE_STATES)
    for status in incomplete_tasks:
        cancel(status['task_id'])
def handle_worker_heartbeat(worker_name):
    """
    This is a generic function for updating worker heartbeat records.

    Existing Worker objects are searched for one to update. If an existing one is found, it is
    updated. Otherwise a new Worker entry is created (the update is an upsert). A newly seen
    worker is logged at the info level and every heartbeat at the debug level.

    The time spent in this function is measured. If it is longer than
    PULP_PROCESS_HEARTBEAT_INTERVAL seconds, a warning is logged with the measured duration.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    start = datetime.utcnow()
    existing_worker = Worker.objects(name=worker_name).first()

    if not existing_worker:
        msg = _("New worker '%s' discovered") % worker_name
        _logger.info(msg)

    timestamp = datetime.utcnow()
    msg = _("Worker heartbeat from '{name}' at time {timestamp}").format(timestamp=timestamp,
                                                                         name=worker_name)
    _logger.debug(msg)

    Worker.objects(name=worker_name).update_one(set__last_heartbeat=timestamp, upsert=True)

    # Sample the clock once so that the duration we report is the same one that was compared
    # against the heartbeat interval.
    elapsed = datetime.utcnow() - start
    if elapsed > timedelta(seconds=PULP_PROCESS_HEARTBEAT_INTERVAL):
        msg = _("Worker {name} heartbeat time {time}s exceeds heartbeat interval. Consider "
                "adjusting the worker_timeout setting.").format(time=elapsed.total_seconds(),
                                                                name=worker_name)
        _logger.warning(msg)
def _delete_worker(name, normal_shutdown=False):
    """
    Remove a worker, its reservations, its pending tasks and its working directory.

    In order: the Worker document called ``name`` is deleted, every ReservedResource document
    recorded for that worker is deleted, each TaskStatus of the worker whose state is listed in
    constants.CALL_INCOMPLETE_STATES is canceled, and the worker's working directory is removed
    through common_utils.delete_worker_working_directory().

    When normal_shutdown is False (the default) an error level message is logged first; for any
    other value nothing is logged.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    worker_is_missing = normal_shutdown is False
    if worker_is_missing:
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.') %
            {'name': name})

    # Remove the worker document and then every reservation recorded against the worker
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Cancel each task of this worker that has not completed yet
    unfinished = TaskStatus.objects(worker_name=name, state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        cancel(status['task_id'])

    # Last step: the worker's working directory
    common_utils.delete_worker_working_directory(name)
def get_resource_manager_lock(name):
    """
    Block until this process holds the resource manager lock.

    Every pass of the loop deletes ResourceManagerLock documents whose timestamp is at least
    constants.PULP_PROCESS_TIMEOUT_INTERVAL seconds old, refreshes (or creates) this process'
    Worker record, and then tries to save its own lock. A mongoengine.NotUniqueError on save
    means the lock is taken; the function then sleeps for
    constants.PULP_PROCESS_HEARTBEAT_INTERVAL seconds and tries again.

    The Worker record is written so that a waiting resource manager shows up in the Status API;
    it is expected to be cleaned up by the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Becomes True after the first failed attempt; it drives the log messages that should only
    # appear once (hot spare detected) or only after waiting (failover occurred).
    had_to_wait = False

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        expiry_cutoff = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)
        ResourceManagerLock.objects(timestamp__lte=expiry_cutoff).delete()

        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)

        try:
            lock.timestamp = now
            lock.save()
        except mongoengine.NotUniqueError:
            # Somebody else holds the lock. Announce ourselves as a hot spare only once.
            if not had_to_wait:
                _logger.info(
                    _("Hot spare pulp_resource_manager instance '%(name)s' detected.") %
                    {'name': name})
                had_to_wait = True
            time.sleep(constants.PULP_PROCESS_HEARTBEAT_INTERVAL)
        else:
            _logger.debug(
                _("Resource manager '%s' has acquired the resource manager lock") % name)
            if had_to_wait:
                _logger.warning(
                    _("Failover occurred: '%s' is now the primary resource manager") % name)
            break
def test_deletes_workers(self, mock_worker, mock_delete_worker):
    """
    Run check_celery_processes() against one worker whose heartbeat is 400 seconds old and one
    that has just reported, and expect a _delete_worker call for the old one.
    """
    stale_worker = Worker(name='name1',
                          last_heartbeat=datetime.utcnow() - timedelta(seconds=400))
    live_worker = Worker(name='name2', last_heartbeat=datetime.utcnow())
    mock_worker.objects.all.return_value = [stale_worker, live_worker]

    monitor = scheduler.CeleryProcessTimeoutMonitor()
    monitor.check_celery_processes()

    # 'name1' carries the old heartbeat, so it is the worker expected to be deleted
    expected_calls = [mock.call('name1')]
    mock_delete_worker.assert_has_calls(expected_calls)
def test_deletes_workers(self, mock_worker, mock_delete_worker):
    """
    Have the mocked Worker query return two workers and expect check_workers() to call
    _delete_worker for each of them, in order.
    """
    first_worker = Worker('name1', datetime.utcnow())
    second_worker = Worker('name2', datetime.utcnow())
    mock_worker.objects.return_value = [first_worker, second_worker]

    monitor = scheduler.WorkerTimeoutMonitor()
    monitor.check_workers()

    # Both workers returned by the query are expected to be passed to _delete_worker
    expected_calls = [mock.call('name1'), mock.call('name2')]
    mock_delete_worker.assert_has_calls(expected_calls)
def test_resource_not_in_resource_map(self):
    """
    Call _release_resource() with an id that matches nothing in the database.

    The call must not raise, and both the Worker collection and the ReservedResource collection
    must look exactly as they did before the call.
    """
    # Two workers ...
    worker_1 = Worker(WORKER_1, datetime.utcnow())
    worker_1.save()
    worker_2 = Worker(WORKER_2, datetime.utcnow())
    worker_2.save()

    # ... each holding one reservation
    reserved_resource_1 = ReservedResource(str(uuid.uuid4()), worker_1.name, 'resource_1')
    reserved_resource_1.save()
    reserved_resource_2 = ReservedResource(str(uuid.uuid4()), worker_2.name, 'resource_2')
    reserved_resource_2.save()

    # Nothing in the database carries this id, so this is expected to be a no-op
    tasks._release_resource('made_up_resource_id')

    # Both workers are still there
    self.assertEqual(Worker.objects().count(), 2)
    for saved_worker in (worker_1, worker_2):
        self.assertTrue(Worker.objects().get(name=saved_worker.name))

    # Both reservations are still there and unchanged
    self.assertEqual(ReservedResource.objects.count(), 2)
    expectations = (
        (reserved_resource_1, 'resource_1'),
        (reserved_resource_2, 'resource_2'),
    )
    for reservation, resource_id in expectations:
        stored = ReservedResource.objects.get(task_id=reservation.task_id)
        self.assertEqual(stored['worker_name'], reservation.worker_name)
        self.assertEqual(stored['resource_id'], resource_id)
def test_logs_resource_manager_missing(self, mock__logger, mock_worker, mock_delete_worker):
    """
    With a scheduler record and one other worker but no resource manager record, expect
    check_celery_processes() to log exactly one error about the missing resource manager.
    """
    scheduler_record = Worker(name=constants.SCHEDULER_WORKER_NAME,
                              last_heartbeat=datetime.utcnow())
    worker_record = Worker(name='name2', last_heartbeat=datetime.utcnow())
    mock_worker.objects.all.return_value = [scheduler_record, worker_record]

    monitor = scheduler.CeleryProcessTimeoutMonitor()
    monitor.check_celery_processes()

    expected_message = (
        'There are 0 pulp_resource_manager processes running. Pulp will not operate '
        'correctly without at least one pulp_resource_mananger process running.')
    mock__logger.error.assert_called_once_with(expected_message)
def get_resource_manager_lock(name):
    """
    Wait for, and then take, the resource manager lock.

    On each attempt, ResourceManagerLock documents with a timestamp at least
    PULP_PROCESS_TIMEOUT_INTERVAL seconds in the past are deleted, this process' Worker record
    is created or refreshed, and the lock is saved. If saving raises
    mongoengine.NotUniqueError the lock is held elsewhere, so the function sleeps for
    PULP_PROCESS_HEARTBEAT_INTERVAL seconds before the next attempt.

    The Worker record exists so that a waiting resource manager is visible in the Status API;
    it is expected to be removed by the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # False until an attempt to take the lock has failed at least once
    attempt_failed_before = False

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        stale_before = now - timedelta(seconds=PULP_PROCESS_TIMEOUT_INTERVAL)
        ResourceManagerLock.objects(timestamp__lte=stale_before).delete()

        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)

        try:
            lock.timestamp = now
            lock.save()
        except mongoengine.NotUniqueError:
            # The hot spare message is only worth logging the first time around
            if not attempt_failed_before:
                hot_spare_msg = _(
                    "Hot spare pulp_resource_manager instance '%(name)s' detected.")
                _logger.info(hot_spare_msg % {'name': name})
                attempt_failed_before = True
            time.sleep(PULP_PROCESS_HEARTBEAT_INTERVAL)
        else:
            acquired_msg = _("Resource manager '%s' has acquired the resource manager lock")
            _logger.debug(acquired_msg % name)
            if attempt_failed_before:
                failover_msg = _("Failover occurred: '%s' is now the primary resource manager")
                _logger.warning(failover_msg % name)
            break
def test_resource_in_resource_map(self):
    """
    Call _release_resource() with the task id of an existing reservation.

    That reservation must disappear from the database while the other one stays untouched.
    """
    now = datetime.utcnow()

    # Two workers sharing the same heartbeat time
    worker_1 = Worker(name=WORKER_1, last_heartbeat=now)
    worker_1.save()
    worker_2 = Worker(name=WORKER_2, last_heartbeat=now)
    worker_2.save()

    # One reservation per worker
    kept_reservation = ReservedResource(task_id=str(uuid.uuid4()), worker_name=worker_1.name,
                                        resource_id='resource_1')
    kept_reservation.save()
    released_reservation = ReservedResource(task_id=str(uuid.uuid4()),
                                            worker_name=worker_2.name,
                                            resource_id='resource_2')
    released_reservation.save()

    # Release the second reservation by its task id
    tasks._release_resource(released_reservation.task_id)

    # Only the first reservation should be left, and it should be unchanged
    self.assertEqual(ReservedResource.objects.count(), 1)
    remaining = ReservedResource.objects.get(task_id=kept_reservation.task_id)
    self.assertEqual(remaining['worker_name'], kept_reservation.worker_name)
    self.assertEqual(remaining['resource_id'], 'resource_1')
def test_debug_logging(self, mock__logger, mock_worker, mock_delete_worker):
    """
    Check the two debug messages emitted by check_celery_processes(): the announcement of the
    check and the per-process-type counts.

    The mocked records are: one worker with a 400 second old heartbeat, one current worker, one
    resource manager and one scheduler.
    """
    records = []
    records.append(Worker(name='name1',
                          last_heartbeat=datetime.utcnow() - timedelta(seconds=400)))
    records.append(Worker(name='name2', last_heartbeat=datetime.utcnow()))
    records.append(Worker(name=RESOURCE_MANAGER_WORKER_NAME, last_heartbeat=datetime.utcnow()))
    records.append(Worker(name=SCHEDULER_WORKER_NAME, last_heartbeat=datetime.utcnow()))
    mock_worker.objects.all.return_value = records

    monitor = scheduler.CeleryProcessTimeoutMonitor()
    monitor.check_celery_processes()

    announcement = mock.call(
        'Checking if pulp_workers, pulp_celerybeat, or '
        'pulp_resource_manager processes are missing for more than 300 seconds')
    counts = mock.call(
        '1 pulp_worker processes, 1 pulp_celerybeat processes, '
        'and 1 pulp_resource_manager processes')
    mock__logger.debug.assert_has_calls([announcement, counts])
def test_get_worker_for_reservation_breaks_out_of_loop(self):
    """
    When get_worker_for_reservation returns a worker, _queue_reserved_task() should neither ask
    for an unreserved worker nor sleep.
    """
    reserved_worker = Worker('worker1', datetime.utcnow())
    self.mock_get_worker_for_reservation.return_value = reserved_worker

    tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

    self.assertFalse(self.mock_get_unreserved_worker.called)
    self.assertFalse(self.mock_time.sleep.called)
def _get_unreserved_worker():
    """
    Pick a Worker that currently has no ReservedResource entry.

    Only workers whose name passes _is_worker() are considered. Which of the eligible workers
    is returned is not specified (it is whatever set.pop() yields).

    :raises NoWorkers: If every eligible worker has at least one reserved_resource entry, or
                       there are no eligible workers at all.
    :returns:          A Worker instance that has no reserved_resource entries associated with
                       it.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Index every known Worker by its name
    workers_by_name = {}
    for worker in Worker.objects():
        workers_by_name[worker['name']] = worker

    # Names of the workers that hold at least one reservation
    busy_names = set(reservation['worker_name']
                     for reservation in ReservedResource.objects.all())

    # Names of the workers that may be given work at all.
    # NB: generator expressions are used because set comprehensions are in python 2.7+
    eligible_names = set(name for name in workers_by_name.keys() if _is_worker(name))

    idle_names = eligible_names - busy_names
    try:
        return workers_by_name[idle_names.pop()]
    except KeyError:
        # Popping from an empty set: all workers are reserved
        raise NoWorkers()
def test_dispatches_inner_task(self):
    """
    _queue_reserved_task() should dispatch the named task exactly once, with the given args and
    kwargs, routed to the worker returned by get_worker_for_reservation.
    """
    target_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
    self.mock_get_worker_for_reservation.return_value = target_worker

    tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

    inner_task = self.mock_celery.tasks['task_name']
    inner_task.apply_async.assert_called_once_with(
        1, 2, a=2, routing_key='worker1', task_id='my_task_id', exchange='C.dq')
def test_dispatches__release_resource(self):
    """
    _queue_reserved_task() should dispatch _release_resource exactly once for the task id,
    routed to the worker returned by get_worker_for_reservation.
    """
    target_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
    self.mock_get_worker_for_reservation.return_value = target_worker

    tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

    release_dispatch = self.mock__release_resource.apply_async
    release_dispatch.assert_called_once_with(
        ('my_task_id', ), routing_key='worker1', exchange='C.dq')
def _get_unreserved_worker():
    """
    Find a Worker without any ReservedResource entry and return it.

    Workers whose name does not pass _is_worker() are never returned. If several workers
    qualify, the one that set.pop() happens to yield is returned.

    :raises NoWorkers: If no eligible worker is free of reserved_resource entries.
    :returns:          A Worker instance that has no reserved_resource entries associated with
                       it.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Map each worker name to its Worker object
    name_to_worker = dict((w['name'], w) for w in Worker.objects())
    all_names = name_to_worker.keys()

    # Collect the names of workers that already hold a reservation
    names_with_reservations = []
    for reservation in ReservedResource.objects.all():
        names_with_reservations.append(reservation['worker_name'])

    # Keep only names that may be assigned work, then remove the reserved ones.
    # NB: set comprehensions are in python 2.7+, hence the explicit set() calls
    assignable = set(filter(_is_worker, all_names))
    free_names = assignable - set(names_with_reservations)

    try:
        picked_name = free_names.pop()
        return name_to_worker[picked_name]
    except KeyError:
        # Nothing left to pop: all workers are reserved
        raise NoWorkers()
def check_workers(self):
    """
    Look for missing workers and clean up after each one found.

    A worker counts as missing when its last_heartbeat is older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The heartbeat times are stored in native UTC, so this is
    a comparable datetime. For every missing worker an error is logged and _delete_worker() is
    called with the worker's name.

    This method logs at the debug and error levels.
    """
    timeout_seconds = self.WORKER_TIMEOUT_SECONDS
    _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout_seconds)

    # Any heartbeat older than this cutoff marks its worker as missing
    cutoff = datetime.utcnow() - timedelta(seconds=timeout_seconds)

    for missing_worker in Worker.objects(last_heartbeat__lt=cutoff):
        template = _("Workers '%s' has gone missing, removing from list of workers")
        _logger.error(template % missing_worker.name)
        _delete_worker(missing_worker.name)
def test_update_repo_and_plugins(self, distributor_update, mock_get_worker_for_reservation):
    """
    Tests the aggregate call to update a repo and its plugins.

    A repo with one importer and two distributors is created. The update changes the repo's
    display name, the importer config and the config of only one of the two distributors. The
    test then verifies the repo, the importer, both distributors and the spawned task that is
    reported for the updated distributor.
    """
    # Setup
    mock_get_worker_for_reservation.return_value = Worker('some_queue', datetime.datetime.now())
    self.manager.create_repo('repo-1', 'Original', 'Original Description')

    importer_manager = manager_factory.repo_importer_manager()
    distributor_manager = manager_factory.repo_distributor_manager()

    importer_manager.set_importer('repo-1', 'mock-importer', {'key-i1': 'orig-1'})
    distributor_manager.add_distributor('repo-1', 'mock-distributor', {'key-d1': 'orig-1'},
                                        True, distributor_id='dist-1')
    distributor_manager.add_distributor('repo-1', 'mock-distributor', {'key-d2': 'orig-2'},
                                        True, distributor_id='dist-2')

    # Test
    repo_delta = {'display_name': 'Updated'}
    new_importer_config = {'key-i1': 'updated-1', 'key-i2': 'new-1'}
    new_distributor_configs = {
        'dist-1': {'key-d1': 'updated-1'},
    }  # only update one of the two distributors

    result = self.manager.update_repo_and_plugins('repo-1', repo_delta, new_importer_config,
                                                  new_distributor_configs)

    self.assertTrue(isinstance(result, TaskResult))
    # assertEqual instead of the deprecated assertEquals alias
    self.assertEqual(None, result.error)
    repo = result.return_value

    # Verify
    self.assertEqual(repo['id'], 'repo-1')
    self.assertEqual(repo['display_name'], 'Updated')
    self.assertEqual(repo['description'], 'Original Description')

    importer = importer_manager.get_importer('repo-1')
    self.assertEqual(importer['config'], new_importer_config)

    dist_1 = distributor_manager.get_distributor('repo-1', 'dist-1')
    self.assertEqual(dist_1['config'], new_distributor_configs['dist-1'])

    # The second distributor was not part of the update, so its config must be the original
    dist_2 = distributor_manager.get_distributor('repo-1', 'dist-2')
    self.assertEqual(dist_2['config'], {'key-d2': 'orig-2'})

    # There should have been a spawned task for the new distributor config
    expected_task_id = TaskStatus.objects.get(
        tags='pulp:repository_distributor:dist-1')['task_id']
    self.assertEqual(result.spawned_tasks, [{'task_id': expected_task_id}])
def test_get_unreserved_worker_breaks_out_of_loop(self):
    """
    When get_worker_for_reservation raises NoWorkers but an unreserved worker is available,
    _queue_reserved_task() should not sleep.
    """
    self.mock_get_worker_for_reservation.side_effect = NoWorkers()
    idle_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
    self.mock_get_unreserved_worker.return_value = idle_worker

    tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

    self.assertFalse(self.mock_time.sleep.called)
def test_creates_and_saves_reserved_resource(self):
    """
    _queue_reserved_task() should build exactly one ReservedResource from the task id, the
    worker's name and the resource id, and save it exactly once.
    """
    target_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
    self.mock_get_worker_for_reservation.return_value = target_worker

    tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

    reserved_resource_cls = self.mock_reserved_resource
    reserved_resource_cls.assert_called_once_with(task_id='my_task_id', worker_name='worker1',
                                                  resource_id='my_resource_id')
    reserved_resource_cls.return_value.save.assert_called_once_with()
def _delete_worker(name, normal_shutdown=False):
    """
    Remove a worker from the database along with its reservations, locks and pending tasks.

    In order: the Worker document called ``name`` is deleted; every ReservedResource document
    of that worker is deleted; if the name starts with RESOURCE_MANAGER_WORKER_NAME the
    matching ResourceManagerLock is deleted; if it starts with SCHEDULER_WORKER_NAME the
    matching CeleryBeatLock is deleted; finally each TaskStatus of the worker whose state is
    listed in constants.CALL_INCOMPLETE_STATES is canceled with revoke_task=False.

    When normal_shutdown is False (the default) an error level message is logged first; for any
    other value an info level message is logged instead.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is not False:
        _logger.info(_("Cleaning up shutdown worker '%s'.") % name)
    else:
        missing_template = _(
            'The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(missing_template % {'name': name})

    # Worker document first, then the reservations recorded for it
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Resource managers and schedulers each own a lock document that has to go as well
    if name.startswith(RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).delete()
    if name.startswith(SCHEDULER_WORKER_NAME):
        CeleryBeatLock.objects(name=name).delete()

    # Cancel everything that was still incomplete on this worker's queue
    pending = TaskStatus.objects(worker_name=name, state__in=constants.CALL_INCOMPLETE_STATES)
    for status in pending:
        cancel(status['task_id'], revoke_task=False)
def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the currently active instance
    becomes unavailable, at which point the worker cleanup routine will clear the lock for us to
    acquire. While waiting, the attempt is repeated every constants.CELERY_CHECK_INTERVAL
    seconds.

    A worker record will be created so that the waiting resource manager will appear in the
    Status API. The whole wait runs inside custom_sigterm_handler(name); this overrides the
    SIGTERM signal handler so that the worker record will be immediately cleaned up if the
    process is killed while in this state.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    with custom_sigterm_handler(name):
        # Whether this is the first lock availability check for this instance
        _first_check = True

        while True:
            # Create / update the worker record so that Pulp knows we exist
            Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(),
                                                 upsert=True)
            try:
                lock.save()

                # Interpolate AFTER the gettext lookup. Formatting inside _() would look up
                # the already-formatted text, which never matches a catalog entry.
                msg = _("Resource manager '%s' has acquired the resource manager lock") % name
                _logger.info(msg)
                break
            except mongoengine.NotUniqueError:
                # Only log the message the first time
                if _first_check:
                    msg = _("Resource manager '%s' attempted to acquire the the resource manager "
                            "lock but was unable to do so. It will retry every %d seconds until "
                            "the lock can be acquired.") % (name,
                                                            constants.CELERY_CHECK_INTERVAL)
                    _logger.info(msg)
                    _first_check = False

                time.sleep(constants.CELERY_CHECK_INTERVAL)
def _delete_worker(name, normal_shutdown=False):
    """
    Delete a worker's database record and clean up everything that hangs off it.

    The steps run in this order:

    1. Log an error if normal_shutdown is False (the default), otherwise log an info message.
    2. Delete the Worker document called ``name``.
    3. Delete every ReservedResource document of that worker.
    4. Delete the ResourceManagerLock for ``name`` if the name starts with
       RESOURCE_MANAGER_WORKER_NAME.
    5. Delete the CeleryBeatLock for ``name`` if the name starts with SCHEDULER_WORKER_NAME.
    6. Cancel, with revoke_task=False, each TaskStatus of the worker whose state is listed in
       constants.CALL_INCOMPLETE_STATES.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    went_missing = normal_shutdown is False
    if went_missing:
        text = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(text % {'name': name})
    else:
        text = _("Cleaning up shutdown worker '%s'.")
        _logger.info(text % name)

    # The worker document and its reserved_resource documents
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # A resource manager also owns a lock document
    is_resource_manager = name.startswith(RESOURCE_MANAGER_WORKER_NAME)
    if is_resource_manager:
        ResourceManagerLock.objects(name=name).delete()

    # ... and so does a scheduler
    is_scheduler = name.startswith(SCHEDULER_WORKER_NAME)
    if is_scheduler:
        CeleryBeatLock.objects(name=name).delete()

    # Cancel all of the incomplete tasks that were assigned to this worker's queue
    incomplete = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for status in incomplete:
        cancel(status['task_id'], revoke_task=False)
def get_resource_manager_lock(name):
    """
    Block until this process holds the resource manager lock.

    Each attempt first creates or refreshes this process' Worker record and then tries to save
    the lock. If saving raises mongoengine.NotUniqueError the lock is held by another instance;
    the function then sleeps for constants.CELERY_CHECK_INTERVAL seconds and tries again. The
    lock is expected to be cleared by the worker cleanup routine once the active instance
    becomes unavailable.

    The Worker record exists so that a waiting resource manager shows up in the Status API; it
    is expected to be cleaned up through the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Set once the "unable to acquire" message has been logged, so it is only logged once
    already_reported = False

    while True:
        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)

        try:
            lock.save()
        except mongoengine.NotUniqueError:
            if not already_reported:
                retry_details = {'name': name, 'interval': constants.CELERY_CHECK_INTERVAL}
                retry_msg = _("Resource manager '%(name)s' attempted to acquire the the resource manager "
                              "lock but was unable to do so. It will retry every %(interval)d seconds "
                              "until the lock can be acquired.") % retry_details
                _logger.info(retry_msg)
                already_reported = True

            time.sleep(constants.CELERY_CHECK_INTERVAL)
        else:
            _logger.info(_("Resource manager '%s' has acquired the resource manager lock") % name)
            break
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is passed through _parse_and_log_event(). If no Worker with the parsed
    'worker_name' exists yet, the discovery is logged at the info level. The Worker's
    last_heartbeat is then set to the parsed 'local_received' value with an upsert, so a missing
    Worker entry is created.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)
    worker_name = event_info['worker_name']

    already_known = Worker.objects(name=worker_name).first()
    if not already_known:
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)

    # upsert=True creates the record when the worker has not been seen before
    matching_workers = Worker.objects(name=worker_name)
    matching_workers.update_one(set__last_heartbeat=event_info['local_received'], upsert=True)
def test_debug_logging(self, mock__logger, mock_worker, mock_delete_worker):
    """
    Check the two debug messages emitted by check_celery_processes(): the announcement of the
    check and the per-process-type counts.

    The mocked records are: one worker whose heartbeat is older by the timeout interval plus
    the heartbeat interval, one current worker, one resource manager and one scheduler.
    """
    timeout = constants.PULP_PROCESS_TIMEOUT_INTERVAL
    combined_delay = timeout + constants.PULP_PROCESS_HEARTBEAT_INTERVAL
    now = datetime.utcnow()

    overdue_worker = Worker(name='name1', last_heartbeat=now - timedelta(seconds=combined_delay))
    current_worker = Worker(name='name2', last_heartbeat=now)
    resource_manager = Worker(name=constants.RESOURCE_MANAGER_WORKER_NAME, last_heartbeat=now)
    celerybeat = Worker(name=constants.SCHEDULER_WORKER_NAME, last_heartbeat=now)
    mock_worker.objects.all.return_value = [
        overdue_worker, current_worker, resource_manager, celerybeat]

    monitor = scheduler.CeleryProcessTimeoutMonitor()
    monitor.check_celery_processes()

    announcement = (
        'Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes are '
        'missing for more than %d seconds' % timeout)
    counts = (
        '1 pulp_worker processes, 1 pulp_celerybeat processes, '
        'and 1 pulp_resource_manager processes')
    mock__logger.debug.assert_has_calls([mock.call(announcement), mock.call(counts)])
def handle_worker_heartbeat(worker_name):
    """
    Record a heartbeat for the worker called ``worker_name``.

    If no Worker with that name exists yet, the discovery is logged at the info level. The
    heartbeat itself is logged at the debug level, and the Worker's last_heartbeat is set to the
    current UTC time with an upsert, so a missing Worker entry is created.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    known_worker = Worker.objects(name=worker_name).first()
    if not known_worker:
        _logger.info(_("New worker '%s' discovered") % worker_name)

    timestamp = datetime.utcnow()
    heartbeat_template = _("Worker heartbeat from '{name}' at time {timestamp}")
    _logger.debug(heartbeat_template.format(timestamp=timestamp, name=worker_name))

    # upsert=True creates the record when the worker has not been seen before
    matching_workers = Worker.objects(name=worker_name)
    matching_workers.update_one(set__last_heartbeat=timestamp, upsert=True)
def get_worker_for_reservation(resource_id):
    """
    Return the Worker that holds a reservation for ``resource_id``.

    The first ReservedResource found for ``resource_id`` decides which worker is looked up. The
    result is the first Worker carrying that reservation's worker_name.

    :param resource_id: The name of the resource you wish to reserve for your task.
    :type  resource_id: basestring
    :raises NoWorkers:  If there is no reserved_resource entry for ``resource_id``.
    :returns:           The Worker instance that has a reserved_resource entry of type
                        `resource_id` associated with it.
    :rtype:             pulp.server.db.model.resources.Worker
    """
    reservation = ReservedResource.objects(resource_id=resource_id).first()
    if not reservation:
        raise NoWorkers()

    holder_name = reservation['worker_name']
    return Worker.objects(name=holder_name).first()
def get_worker_for_reservation_list(resources):
    """
    Return the Worker instance that is associated with the reservations described by the
    'resources' list. This will be either an existing Worker that is dealing with at least
    one of the specified resources, or an available idle Worker. We sleep and retry the
    request until it can be fulfilled.

    The three cases handled on every pass are:

    * exactly one worker holds any of the resources: that worker is returned
    * no worker holds any of the resources: an unreserved worker is returned if there is one
    * several workers hold some of the resources: nothing can be returned yet

    Whenever no worker can be returned the function sleeps for 0.25 seconds and tries again, so
    it does not return until one of the first two cases succeeds.

    :param resources: A list of the names of the resources you wish to reserve for your task.
    :type  resources: list
    :returns:         The Worker instance that has a reserved_resource entry associated with it
                      for each resource in 'resources'
    :rtype:           pulp.server.db.model.resources.Worker
    """
    # Wrap in a 1-tuple so that a tuple argument is formatted as a single value instead of
    # being unpacked by the % operator (which raises TypeError for any length other than 1).
    _logger.debug('get_worker_for_reservation_list [%s]' % (resources,))

    # We leave this loop once we find a Worker to return - otherwise, sleep and try again
    while True:
        reservation_workers = set(
            reservation['worker_name']
            for reservation in ReservedResource.objects(resource_id__in=resources))
        holder_count = len(reservation_workers)
        _logger.debug('...num-RR is %d' % holder_count)

        if holder_count == 1:
            # Exactly one worker holds any of the desired resources
            _logger.debug('...one-holds')
            return Worker.objects(name=list(reservation_workers)[0]).first()
        elif holder_count == 0:
            # No worker holds any of the desired resources
            _logger.debug('...zero-holds')
            try:
                return _get_unreserved_worker()
            except NoWorkers:
                _logger.debug('...unresolved NoWorkers - WAIT')
        else:
            _logger.debug('...multiple-holds - WAIT')

        time.sleep(0.25)
def check_workers(self):
    """
    Find workers that stopped sending heartbeats and remove them.

    Workers whose last_heartbeat is older than utcnow() - WORKER_TIMEOUT_SECONDS are considered
    missing. The heartbeat times are stored in native UTC, so this is a comparable datetime.
    For each of them an error is logged and _delete_worker() is called with the worker's name.

    This method logs at the debug and error levels.
    """
    debug_msg = _('Looking for workers missing for more than %s seconds')
    _logger.debug(debug_msg % self.WORKER_TIMEOUT_SECONDS)

    # Heartbeats before this point in time are too old
    allowed_silence = timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    oldest_allowed = datetime.utcnow() - allowed_silence

    missing_workers = Worker.objects(last_heartbeat__lt=oldest_allowed)
    for missing in missing_workers:
        error_msg = _("Workers '%s' has gone missing, removing from list of workers")
        _logger.error(error_msg % missing.name)
        _delete_worker(missing.name)
def get(self, request, task_id):
    """
    Return a response containing a single task.

    The task is serialized with task_serializer(). If the serialized task has a 'worker_name',
    a 'queue' entry is added to it; the queue name is read from an unsaved Worker built from
    that name.

    :param request: WSGI request object
    :type  request: django.core.handlers.wsgi.WSGIRequest
    :param task_id: The ID of the task you wish to retrieve
    :type  task_id: basestring

    :return: Response containing a serialized dict of the requested task
    :rtype : django.http.HttpResponse
    :raises MissingResource: if task is not found
    """
    try:
        task = TaskStatus.objects.get(task_id=task_id)
    except DoesNotExist:
        raise MissingResource(task_id)

    serialized_task = task_serializer(task)
    if 'worker_name' in serialized_task:
        # This Worker is never saved; it only exists to derive the queue name
        worker = Worker(name=serialized_task['worker_name'], last_heartbeat=datetime.now())
        serialized_task.update({'queue': worker.queue_name})

    return generate_json_response_with_pulp_encoder(serialized_task)
def tearDown(self):
    """
    Delete all Worker, ReservedResource and TaskStatus documents after a test has run.
    """
    Worker.objects().delete()
    ReservedResource.objects.delete()
    TaskStatus.objects().delete()
def get_workers():
    """
    Return all Worker records.

    :returns: the result of Worker.objects(), i.e. the workers with their heartbeats
    :rtype:   list (NOTE(review): Worker.objects() presumably returns a query result object
              rather than a plain list -- confirm against the Worker model)
    """
    return Worker.objects()
def test_resource_not_in_resource_map(self):
    """
    Call _release_resource() with an id that matches nothing in the database.

    The call must not raise, and both the Worker collection and the ReservedResource collection
    must look exactly as they did before the call.
    """
    # Two workers ...
    worker_1 = Worker(name=WORKER_1, last_heartbeat=datetime.utcnow())
    worker_1.save()
    worker_2 = Worker(name=WORKER_2, last_heartbeat=datetime.utcnow())
    worker_2.save()

    # ... each holding one reservation
    reserved_resource_1 = ReservedResource(task_id=str(uuid.uuid4()),
                                           worker_name=worker_1.name,
                                           resource_id='resource_1')
    reserved_resource_1.save()
    reserved_resource_2 = ReservedResource(task_id=str(uuid.uuid4()),
                                           worker_name=worker_2.name,
                                           resource_id='resource_2')
    reserved_resource_2.save()

    # Nothing in the database carries this id, so this is expected to be a no-op
    tasks._release_resource('made_up_resource_id')

    # Both workers are still there
    self.assertEqual(Worker.objects().count(), 2)
    for saved_worker in (worker_1, worker_2):
        self.assertTrue(Worker.objects().get(name=saved_worker.name))

    # Both reservations are still there and unchanged
    self.assertEqual(ReservedResource.objects.count(), 2)
    expectations = (
        (reserved_resource_1, 'resource_1'),
        (reserved_resource_2, 'resource_2'),
    )
    for reservation, resource_id in expectations:
        stored = ReservedResource.objects.get(task_id=reservation.task_id)
        self.assertEqual(stored['worker_name'], reservation.worker_name)
        self.assertEqual(stored['resource_id'], resource_id)