def test_handle_successful_pull_task(self):
    """Tests handling the Docker pull task successfully"""
    start = now()
    scheduler_node = Node(self.node_agent, self.node)
    # NOTE(review): attribute spelling '_last_heath_task' differs from
    # '_last_health_task' used elsewhere in this file — confirm which one
    # the Node class actually defines
    scheduler_node._last_heath_task = start
    scheduler_node._initial_cleanup_completed()
    scheduler_node._update_state()

    # The first task offered should be the Docker pull task for this agent
    pull_task = scheduler_node.get_next_tasks(start)[0]
    self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(pull_task.agent_id, self.node_agent)

    # Once launched, no further task is offered and the image is still
    # considered un-pulled
    self.task_mgr.launch_tasks([pull_task], now())
    self.assertListEqual([], scheduler_node.get_next_tasks(start))
    self.assertFalse(scheduler_node._is_image_pulled)

    # Drive the task through RUNNING and then FINISHED
    running_update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running_update)
    scheduler_node.handle_task_update(running_update)
    finished_update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id,
                                                               TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(finished_update)
    scheduler_node.handle_task_update(finished_update)

    # No new task is offered, the image is now pulled, and the node is READY
    self.assertListEqual([], scheduler_node.get_next_tasks(start))
    self.assertTrue(scheduler_node._is_image_pulled)
    self.assertEqual(scheduler_node._state, Node.READY)
def test_handle_killed_pull_task(self):
    """Tests handling a killed Docker pull task

    A pull task that is killed after running should be replaced by a new
    pull task (different task ID) on the next request for tasks.
    """
    when = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): attribute spelling '_last_heath_task' differs from
    # '_last_health_task' used elsewhere in this file — confirm which one
    # the Node class actually defines
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._update_state()

    # Get pull task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    task_1_id = task.id

    # Kill task after running and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    # Image was never pulled successfully
    self.assertFalse(node._is_image_pulled)
def test_handle_failed_health_task_low_docker_space(self):
    """Tests handling a failed health task where Docker has low disk space"""
    current_time = now()
    scheduler_node = Node(self.node_agent, self.node)
    scheduler_node._initial_cleanup_completed()
    scheduler_node._image_pull_completed()
    scheduler_node._update_state()

    # The first task offered should be the health check task
    health_task = scheduler_node.get_next_tasks(current_time)[0]
    self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Run the task, then fail it with the low-Docker-space exit code
    self.task_mgr.launch_tasks([health_task], now())
    running_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running_update)
    scheduler_node.handle_task_update(running_update)
    failed_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                             TaskStatusUpdate.FAILED, now(),
                                                             exit_code=HealthTask.LOW_DOCKER_SPACE_CODE)
    self.task_mgr.handle_task_update(failed_update)
    scheduler_node.handle_task_update(failed_update)

    # Node should be degraded with the low-Docker-space error active
    self.assertEqual(scheduler_node._state, Node.DEGRADED)
    self.assertIn(NodeConditions.LOW_DOCKER_SPACE_ERR.name, scheduler_node._conditions._active_errors)
def test_handle_successful_health_task(self):
    """Tests handling the health task successfully"""
    when = now()
    node = Node(self.node_agent, self.node)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertEqual(task.agent_id, self.node_agent)

    # Schedule health task and make sure no new task is ready
    self.task_mgr.launch_tasks([task], now())
    self.assertListEqual([], node.get_next_tasks(when))
    self.assertTrue(node._conditions.is_health_check_normal)

    # Complete health task, verify no new task
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    self.assertListEqual([], node.get_next_tasks(when))
    # Health status remains normal after a successful check
    self.assertTrue(node._conditions.is_health_check_normal)
def test_handle_failed_pull_task(self):
    """Tests handling failed Docker pull task"""
    start = now()
    scheduler_node = Node(self.node_agent, self.node, self.scheduler)
    scheduler_node._last_health_task = start
    scheduler_node._initial_cleanup_completed()
    scheduler_node._update_state()

    # The first task offered should be the Docker pull task
    pull_task = scheduler_node.get_next_tasks(start)[0]
    first_task_id = pull_task.id
    self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))

    # Run the task, then fail it
    self.task_mgr.launch_tasks([pull_task], now())
    running_update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running_update)
    scheduler_node.handle_task_update(running_update)
    failed_update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id,
                                                             TaskStatusUpdate.FAILED, now())
    self.task_mgr.handle_task_update(failed_update)
    scheduler_node.handle_task_update(failed_update)

    # Within the error threshold, no replacement pull task is offered
    self.assertListEqual([], scheduler_node.get_next_tasks(start + datetime.timedelta(seconds=5)))
    self.assertFalse(scheduler_node._is_image_pulled)

    # After the error threshold passes, a new pull task is offered
    retry_time = start + Node.IMAGE_PULL_ERR_THRESHOLD + datetime.timedelta(seconds=5)
    scheduler_node._last_health_task = retry_time  # Get rid of health check task
    retry_task = scheduler_node.get_next_tasks(retry_time)[0]
    self.assertNotEqual(retry_task.id, first_task_id)
    self.assertTrue(retry_task.id.startswith(PULL_TASK_ID_PREFIX))
def test_handle_failed_health_task_bad_logstash(self):
    """Tests handling a failed health task where logstash is unreachable"""
    current_time = now()
    scheduler_node = Node(self.node_agent, self.node)
    scheduler_node._initial_cleanup_completed()
    scheduler_node._image_pull_completed()
    scheduler_node._update_state()

    # The first task offered should be the health check task
    health_task = scheduler_node.get_next_tasks(current_time)[0]
    self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Run the task, then fail it with the bad-logstash exit code
    self.task_mgr.launch_tasks([health_task], now())
    running_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running_update)
    scheduler_node.handle_task_update(running_update)
    failed_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                             TaskStatusUpdate.FAILED, now(),
                                                             exit_code=HealthTask.BAD_LOGSTASH_CODE)
    self.task_mgr.handle_task_update(failed_update)
    scheduler_node.handle_task_update(failed_update)

    # Node should be degraded with the bad-logstash error active
    self.assertEqual(scheduler_node._state, Node.DEGRADED)
    self.assertIn(NodeConditions.BAD_LOGSTASH_ERR.name, scheduler_node._conditions._active_errors)
def test_handle_regular_cleanup_task(self):
    """Tests handling a regular cleanup task"""
    when = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): attribute spelling '_last_heath_task' differs from
    # '_last_health_task' used elsewhere in this file — confirm which one
    # the Node class actually defines
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # No task since there are no job executions to clean
    self.assertListEqual([], node.get_next_tasks(when))

    # Add job execution and complete task to clean it up
    job_exe = RunningJobExecution(self.job_exe)
    node.add_job_execution(job_exe)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertFalse(task.is_initial_cleanup)
    self.assertListEqual(task.job_exes, [job_exe])
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    # No task since all job executions have been cleaned
    self.assertListEqual([], node.get_next_tasks(when))
def test_generate_status_json(self, mock_now):
    """Tests calling generate_status_json() successfully

    ``mock_now`` is presumably injected by a ``patch`` decorator on this
    method (not visible here — verify); pinning ``now()`` keeps the
    expected timestamps in the JSON stable.
    """
    right_now = now()
    mock_now.return_value = right_now
    # One more job exe than the warning threshold so the CLEANUP warning fires
    num_job_exes = JOB_EXES_WARNING_THRESHOLD + 1
    node = Node(self.node_agent, self.node, self.scheduler)
    # Force an active IMAGE_PULL error and a CLEANUP warning before
    # generating the status
    node._conditions.handle_pull_task_failed()
    node._conditions.update_cleanup_count(num_job_exes)
    node._update_state()
    nodes_list = []
    node.generate_status_json(nodes_list)
    expected_results = [{'id': node.id, 'hostname': node.hostname, 'agent_id': self.node_agent, 'is_active': True,
                         'state': {'name': 'DEGRADED', 'title': Node.DEGRADED.title,
                                   'description': Node.DEGRADED.description},
                         'errors': [{'name': 'IMAGE_PULL', 'title': NodeConditions.IMAGE_PULL_ERR.title,
                                     'description': NodeConditions.IMAGE_PULL_ERR.description,
                                     'started': datetime_to_string(right_now),
                                     'last_updated': datetime_to_string(right_now)}],
                         'warnings': [{'name': 'CLEANUP', 'title': NodeConditions.CLEANUP_WARNING.title,
                                       'description': NodeConditions.CLEANUP_WARNING.description % num_job_exes,
                                       'started': datetime_to_string(right_now),
                                       'last_updated': datetime_to_string(right_now)}]}]
    self.assertListEqual(nodes_list, expected_results)
def test_paused_node_pull_task(self):
    """Tests not returning pull task when its node is paused"""
    current_time = now()
    paused_node_model = node_test_utils.create_node(hostname='host_1_paused', slave_id='agent_paused')
    paused_node_model.is_paused = True
    paused_node = Node('agent_paused', paused_node_model, self.scheduler)
    paused_node._last_health_task = current_time
    paused_node._initial_cleanup_completed()
    paused_node._update_state()

    # A paused node must not offer any pull task
    self.assertListEqual([], paused_node.get_next_tasks(current_time))
def test_handle_lost_pull_task(self):
    """Tests handling lost pull task

    A lost pull task should be re-offered with the SAME task ID, whether it
    was lost before scheduling, after scheduling, or after running.
    """
    when = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): attribute spelling '_last_heath_task' differs from
    # '_last_health_task' used elsewhere in this file — confirm which one
    # the Node class actually defines
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._update_state()

    # Get pull task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    task_1_id = task.id
    self.assertIsNotNone(task)

    # Lose task without scheduling and get same task again
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)

    # Lose task with scheduling and get same task again
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)

    # Lose task after running and get same task again
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)
def test_handle_lost_health_task(self):
    """Tests handling lost health task

    Unlike a lost pull task, a lost health task is replaced with a NEW task
    (different task ID) in every case, and the health status stays normal.
    """
    when = now()
    node = Node(self.node_agent, self.node, self.scheduler)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    task_1_id = task.id
    self.assertIsNotNone(task)

    # Lose task without scheduling and get different task next time
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)

    # Lose task with scheduling and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)

    # Lose task after running and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)
def test_handle_failed_health_task(self):
    """Tests handling failed health task"""
    start = now()
    scheduler_node = Node(self.node_agent, self.node)
    scheduler_node._initial_cleanup_completed()
    scheduler_node._image_pull_completed()
    scheduler_node._update_state()

    # The first task offered should be the health check task
    health_task = scheduler_node.get_next_tasks(start)[0]
    first_task_id = health_task.id
    self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Run the task, then fail it
    self.task_mgr.launch_tasks([health_task], now())
    running_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running_update)
    scheduler_node.handle_task_update(running_update)
    failed_update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id,
                                                             TaskStatusUpdate.FAILED, now())
    self.task_mgr.handle_task_update(failed_update)
    scheduler_node.handle_task_update(failed_update)

    # Node is degraded with the health-failure error active
    self.assertEqual(scheduler_node._state, Node.DEGRADED)
    self.assertIn(NodeConditions.HEALTH_FAIL_ERR.name, scheduler_node._conditions._active_errors)

    # Within the error threshold, no replacement health task is offered
    self.assertListEqual([], scheduler_node.get_next_tasks(start + datetime.timedelta(seconds=5)))
    self.assertFalse(scheduler_node._conditions.is_health_check_normal)

    # After the error threshold passes, a new health task is offered
    retry_time = start + Node.HEALTH_ERR_THRESHOLD + datetime.timedelta(seconds=5)
    retry_task = scheduler_node.get_next_tasks(retry_time)[0]
    self.assertNotEqual(retry_task.id, first_task_id)
    self.assertTrue(retry_task.id.startswith(HEALTH_TASK_ID_PREFIX))
class TestOfferManager(TestCase):
    """Tests the OfferManager class"""

    def setUp(self):
        """Creates the nodes, running job executions, and queued jobs used by the tests"""
        django.setup()

        Scheduler.objects.initialize_scheduler()
        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        # Active node, primed as ready (image pulled, initial cleanup done)
        self.node = Node(self.node_agent, self.node_model)
        self.node._is_image_pulled = True
        self.node._initial_cleanup_completed()
        self.node._update_state()
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Running job executions: one on the paused node, one on the active node
        self.running_job_exe_1 = job_test_utils.create_job_exe(status='RUNNING', node=self.paused_node_model)
        self.running_job_exe_1.cpus_scheduled = 2.0
        self.running_job_exe_1.mem_scheduled = 512.0
        self.running_job_exe_1.disk_in_scheduled = 100.0
        self.running_job_exe_1.disk_out_scheduled = 200.0
        self.running_job_exe_1.disk_total_scheduled = 300.0
        self.running_job_exe_2 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model)
        self.running_job_exe_2.cpus_scheduled = 2.0
        self.running_job_exe_2.mem_scheduled = 512.0
        self.running_job_exe_2.disk_in_scheduled = 100.0
        self.running_job_exe_2.disk_out_scheduled = 200.0
        self.running_job_exe_2.disk_total_scheduled = 300.0

        # Queued jobs: two that fit normal offers, plus one each that exceeds
        # the offered CPUs, memory, or disk
        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(cpus_required=200.0, mem_required=1024.0,
                                                             disk_in_required=100.0, disk_out_required=200.0,
                                                             disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(cpus_required=2.0, mem_required=10240.0,
                                                            disk_in_required=100.0, disk_out_required=200.0,
                                                            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0,
                                                             disk_in_required=10000.0, disk_out_required=20000.0,
                                                             disk_total_required=30000.0)

    def test_no_ready_offers(self):
        """Tests considering job executions when no offers are ready"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        # Offers added but never readied, so nothing can be scheduled
        manager.add_new_offers([offer_1, offer_2])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.NODE_NOT_READY)

    def test_offers_with_no_nodes(self):
        """Tests considering job executions when offers cannot be readied due to no nodes updated"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        # ready_new_offers() is called but update_nodes() never was, so the
        # offers still cannot be matched to any node
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.NODE_NOT_READY)

    def test_accepted(self):
        """Tests accepting a running and queued job execution and returning the node offers"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 2)

    def test_remove_offers(self):
        """Tests accepting a running and queued job execution and then removing all offers"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        # Removing the offers discards the accepted job executions with them
        manager.remove_offers([offer_2.id, offer_1.id])

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_lost_node(self):
        """Tests accepting a running and queued job execution and then the node being lost"""
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        # Losing the node discards its offers and accepted job executions
        manager.lost_node(self.node_agent)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_lost_node_that_comes_back(self):
        """Tests that when a lost node comes back, it can schedule tasks again"""
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # Node goes down and comes back up with new agent ID
        manager.lost_node(self.node_agent)
        new_node_agent = 'i_am_a_new_node_agent'
        self.node.update_from_mesos(agent_id=new_node_agent)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        # Offers for previous agent should be gone, do not schedule the job exe
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        offer_3 = ResourceOffer('offer_3', new_node_agent, NodeResources(cpus=35.0, mem=3048.0, disk=3048.0))
        manager.add_new_offers([offer_3])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # New offers have come in for new agent ID, should schedule job exe now
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)
        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 1)

    def test_all_offers_paused(self):
        """Tests rejecting a queued job execution due to all nodes being paused"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent_paused, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_high_cpus(self):
        """Tests rejecting a queued job execution due to too many CPUs required"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_high_cpus)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NOT_ENOUGH_CPUS)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_high_mem(self):
        """Tests rejecting a queued job execution due to too much memory required"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_high_mem)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NOT_ENOUGH_MEM)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_high_disk(self):
        """Tests rejecting a queued job execution due to too much disk required"""
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_high_disk)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NOT_ENOUGH_DISK)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)
class TestNodeOffers(TestCase):
    """Tests the NodeOffers class: accumulating resource offers for a single
    node and accepting/rejecting queued and running job executions against
    those resources.
    """

    def setUp(self):
        django.setup()

        Scheduler.objects.initialize_scheduler()

        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'

        # A ready node (image pulled, initial cleanup done) and a paused node
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node._is_image_pulled = True
        self.node._initial_cleanup_completed()
        self.node._update_state()
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Running job executions with modest resource requirements
        self.running_job_exe_1 = job_test_utils.create_job_exe(status='RUNNING')
        self.running_job_exe_1.cpus_scheduled = 2.0
        self.running_job_exe_1.mem_scheduled = 512.0
        self.running_job_exe_1.disk_in_scheduled = 100.0
        self.running_job_exe_1.disk_out_scheduled = 200.0
        self.running_job_exe_1.disk_total_scheduled = 300.0
        self.running_job_exe_2 = job_test_utils.create_job_exe(status='RUNNING')
        self.running_job_exe_2.cpus_scheduled = 4.0
        self.running_job_exe_2.mem_scheduled = 1024.0
        self.running_job_exe_2.disk_in_scheduled = 500.0
        self.running_job_exe_2.disk_out_scheduled = 50.0
        self.running_job_exe_2.disk_total_scheduled = 550.0

        # Running job executions that each exceed one resource dimension
        self.running_job_exe_high_cpus = job_test_utils.create_job_exe(status='RUNNING')
        self.running_job_exe_high_cpus.cpus_scheduled = 200.0
        self.running_job_exe_high_cpus.mem_scheduled = 512.0
        self.running_job_exe_high_cpus.disk_in_scheduled = 100.0
        self.running_job_exe_high_cpus.disk_out_scheduled = 200.0
        self.running_job_exe_high_cpus.disk_total_scheduled = 300.0
        self.running_job_exe_high_mem = job_test_utils.create_job_exe(status='RUNNING')
        self.running_job_exe_high_mem.cpus_scheduled = 2.0
        self.running_job_exe_high_mem.mem_scheduled = 1048576.0
        self.running_job_exe_high_mem.disk_in_scheduled = 100.0
        self.running_job_exe_high_mem.disk_out_scheduled = 200.0
        self.running_job_exe_high_mem.disk_total_scheduled = 300.0
        self.running_job_exe_high_disk = job_test_utils.create_job_exe(status='RUNNING')
        self.running_job_exe_high_disk.cpus_scheduled = 2.0
        self.running_job_exe_high_disk.mem_scheduled = 512.0
        self.running_job_exe_high_disk.disk_in_scheduled = 10000.0
        self.running_job_exe_high_disk.disk_out_scheduled = 20000.0
        self.running_job_exe_high_disk.disk_total_scheduled = 30000.0

        # Queue models mirroring the running executions above
        self.queue_1 = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(cpus_required=200.0, mem_required=1024.0,
                                                             disk_in_required=100.0, disk_out_required=200.0,
                                                             disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(cpus_required=2.0, mem_required=10240.0,
                                                            disk_in_required=100.0, disk_out_required=200.0,
                                                            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0,
                                                             disk_in_required=10000.0, disk_out_required=20000.0,
                                                             disk_total_required=30000.0)

    def test_adding_offers(self):
        """Tests adding offer and checking the results"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        node_offers.add_offer(offer_1)  # Add same offer twice, should ignore
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=5.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        offer_3 = ResourceOffer('offer_3', self.node_agent, NodeResources(cpus=3.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(offer_3)
        # Does not get added into total
        offer_4 = ResourceOffer('offer_4', 'bad_agent', NodeResources(cpus=1.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(offer_4)

        # Totals come from offers 1-3 only; offer_4's agent does not match
        self.assertEqual(node_offers._available_cpus, 10.0)
        self.assertEqual(node_offers._available_mem, 3584.0)
        self.assertEqual(node_offers._available_disk, 4096.0)

    def test_consider_new_job_exe(self):
        """Tests consider_new_job_exe() and get_accepted_new_job_exes()"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        result = node_offers.consider_new_job_exe(job_exe_1)  # Same job_exe, should have no effect
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Each over-sized execution is rejected with its own result code
        job_exe_high_cpus = QueuedJobExecution(self.queue_high_cpus)
        result = node_offers.consider_new_job_exe(job_exe_high_cpus)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)
        job_exe_high_mem = QueuedJobExecution(self.queue_high_mem)
        result = node_offers.consider_new_job_exe(job_exe_high_mem)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)
        job_exe_high_disk = QueuedJobExecution(self.queue_high_disk)
        result = node_offers.consider_new_job_exe(job_exe_high_disk)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = QueuedJobExecution(self.queue_2)
        result = node_offers.consider_new_job_exe(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_new_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_new_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])

        # Remaining resources after accepting job_exe_1 and job_exe_2
        self.assertEqual(node_offers._available_cpus, 64.0)
        self.assertEqual(node_offers._available_mem, 1536.0)
        self.assertEqual(node_offers._available_disk, 2327.0)

    def test_consider_next_task(self):
        """Tests consider_next_task() and get_accepted_running_job_exes()"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        result = node_offers.consider_next_task(job_exe_1)  # Same job_exe, should have no effect
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Each over-sized execution is rejected with its own result code
        job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
        result = node_offers.consider_next_task(job_exe_high_cpus)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)
        job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
        result = node_offers.consider_next_task(job_exe_high_mem)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)
        job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
        result = node_offers.consider_next_task(job_exe_high_disk)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Remaining resources after accepting both running executions
        self.assertEqual(node_offers._available_cpus, 68.0)
        self.assertEqual(node_offers._available_mem, 1536.0)
        self.assertEqual(node_offers._available_disk, 2222.0)

    def test_paused_node(self):
        """Tests adding job executions when the node is paused"""

        node_offers = NodeOffers(self.paused_node)
        offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent_paused, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Ensure it accepts new tasks for already running job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Don't accept new job executions while paused
        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NODE_NOT_READY)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self.assertEqual(node_offers._available_cpus, 68.0)
        self.assertEqual(node_offers._available_mem, 1536.0)
        self.assertEqual(node_offers._available_disk, 2222.0)

    def test_lost_node(self):
        """Tests when the node is lost"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Accept a couple job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        job_exe_2 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertGreater(node_offers._available_cpus, 0.0)
        self.assertGreater(node_offers._available_mem, 0.0)
        self.assertGreater(node_offers._available_disk, 0.0)

        # Node is lost: accepted executions and resources are dropped
        node_offers.lost_node()
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertEqual(node_offers._available_cpus, 0.0)
        self.assertEqual(node_offers._available_mem, 0.0)
        self.assertEqual(node_offers._available_disk, 0.0)

    def test_no_offers(self):
        """Tests adding job executions when there are no offers"""

        node_offers = NodeOffers(self.node)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Without any offered resources, nothing can be accepted
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.NO_OFFERS)
        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NO_OFFERS)

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    def test_job_exe_canceled(self):
        """Tests adding a job execution that becomes canceled while scheduling"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # A canceled execution's next task is invalid and must be rejected
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        job_exe_1.execution_canceled()
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.TASK_INVALID)

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    def test_remove_offer(self):
        """Tests remove_offer()"""

        node_offers = NodeOffers(self.node)
        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Remove one offer, new job execution should still be accepted
        node_offers.remove_offer(offer_1.id)
        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [job_exe_1])

        # Remove second offer, no resources left, all job executions should be removed
        node_offers.remove_offer(offer_2.id)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
        self.assertEqual(node_offers._available_cpus, 0.0)
        self.assertEqual(node_offers._available_mem, 0.0)
        self.assertEqual(node_offers._available_disk, 0.0)