def test_update_all_cluster_resources(self):
    """Tests successfully updating the all cluster resources database in a cluster"""
    offers = [
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(22048.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
        ResourceOffer('offer_3', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(225.0), Mem(1024.0), Disk(22048.0)]), now(), None),
    ]
    resource_mgr.add_new_offers(offers)
    resource_mgr.refresh_agent_resources([], now())

    # No cluster resources row should exist before the update runs
    self.assertIsNone(ClusterResources.objects.first())

    resource_mgr.update_all_cluster_resources()

    # The update creates a row whose totals are the sums across all three offers
    resource_db = ClusterResources.objects.first()
    self.assertIsNotNone(resource_db)
    self.assertEqual(resource_db.cpus, 252.0)
    self.assertEqual(resource_db.mem, 25120.0)
    self.assertEqual(resource_db.disk, 25120.0)
    self.assertEqual(resource_db.gpus, 0.0)
def test_schedule_system_tasks(self):
    """Tests successfully calling perform_scheduling() when scheduling system tasks"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    # Clear the queue so only system tasks are eligible for scheduling
    Queue.objects.all().delete()
    # Force a database update task to be scheduled
    system_task_mgr._is_db_update_completed = False
    # Request two message handler tasks
    Scheduler.objects.update(num_message_handlers=2)
    scheduler_mgr.sync_with_database()

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())

    # One database update task plus two message handler tasks
    self.assertEqual(num_tasks, 3)
def test_score_job_exe_for_scheduling_insufficient_resources(self):
    """Tests calling score_job_exe_for_scheduling() when there are not enough resources to schedule the job"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=True)
    node.is_ready_for_next_job_task = MagicMock(return_value=True)

    resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]),
                               NodeResources([Cpus(100.0), Mem(500.0)]),
                               NodeResources([Cpus(200.0), Mem(700.0)]))
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    # Allocate 10 CPUs and 50 MiB memory to an existing job execution
    running_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]))
    scheduling_node.accept_job_exe_next_task(running_exe, [])

    # Only 10 CPUs and 50 MiB memory remain, so this queued job is too big
    queue_model = queue_test_utils.create_queue(cpus_required=15.0, mem_required=40.0,
                                                disk_in_required=0.0, disk_out_required=0.0,
                                                disk_total_required=0.0)
    queued_exe = QueuedJobExecution(queue_model)

    self.assertIsNone(scheduling_node.score_job_exe_for_scheduling(queued_exe, []))
def test_successful_schedule(self):
    """Tests successfully calling perform_scheduling()"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    scheduled_count = SchedulingManager().perform_scheduling(self._client, now())

    # The two smaller queued job executions fit; the large one does not
    self.assertEqual(scheduled_count, 2)

    # Job execution models are created for the scheduled jobs only...
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_large.job_id).count(), 0)
    # ...and their queue models are removed
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)
def test_canceled_queue_model(self):
    """Tests successfully calling perform_scheduling() when a queue model has been canceled"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    # Cancel the first queued job before scheduling runs
    self.queue_1.is_canceled = True
    self.queue_1.save()

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())
    self.assertEqual(num_tasks, 1)  # Only the non-canceled queued job execution is scheduled

    # Both job execution models exist and both queue models are gone
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)

    # Job execution manager should have a message for the canceled job execution
    messages = job_exe_mgr.get_messages()
    self.assertTrue(any(message.type == 'create_job_exe_ends' for message in messages))
def test_accept_job_exe_next_task_canceled(self):
    """Tests calling accept_job_exe_next_task() when job exe gets canceled (no next task)"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=True)
    node.is_ready_for_next_job_task = MagicMock(return_value=True)

    offered = NodeResources([Cpus(10.0), Mem(50.0)])
    resource_set = ResourceSet(offered, NodeResources(), NodeResources([Cpus(100.0), Mem(500.0)]))
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    running_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(1.0), Mem(10.0)]))
    running_exe.execution_canceled(now())  # Canceling leaves the exe with no next task

    waiting = []
    self.assertFalse(scheduling_node.accept_job_exe_next_task(running_exe, waiting))

    # Nothing was allocated, resources are untouched, and no task is waiting
    self.assertEqual(len(scheduling_node._allocated_running_job_exes), 0)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(NodeResources([Cpus(10.0), Mem(50.0)])))
    self.assertListEqual(waiting, [])
def test_missing_workspace(self):
    """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    # Point both queued jobs at an output workspace
    for queue_id in (self.queue_1.id, self.queue_2.id):
        queue = Queue.objects.get(id=queue_id)
        config = queue.get_execution_configuration()
        config.set_output_workspaces({'my_output': 'my_workspace'})
        queue.configuration = config.get_dict()
        queue.save()

    manager = SchedulingManager()

    # Empty the scheduler's workspace cache so the workspace cannot be resolved
    with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
        mock_get_workspaces.return_value = {}
        scheduled_count = manager.perform_scheduling(self._client, now())

    # Nothing should be scheduled and both queue models should remain
    self.assertEqual(scheduled_count, 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
def test_accept_node_tasks_insufficient_resources(self):
    """Tests calling accept_node_tasks() when there are not enough resources"""
    health_task = HealthTask('1234', 'agent_1')
    pull_task = PullTask('1234', 'agent_1')

    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=True)
    node.is_ready_for_next_job_task = MagicMock(return_value=True)
    node.get_next_tasks = MagicMock(return_value=[health_task, pull_task])

    # Zero CPUs offered, so neither node task can be accepted
    offered_resources = NodeResources([Cpus(0.0), Mem(50.0)])
    resource_set = ResourceSet(offered_resources, NodeResources(),
                               NodeResources([Cpus(100.0), Mem(500.0)]))
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    waiting_tasks = []
    self.assertTrue(scheduling_node.accept_node_tasks(now(), waiting_tasks))

    # Nothing allocated; both node tasks end up waiting
    self.assertEqual(len(scheduling_node.allocated_tasks), 0)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(offered_resources))
    self.assertListEqual(waiting_tasks, [health_task, pull_task])
def test_accept_new_job_exe_gpu_partial_node_other_task(self):
    """Tests calling accept_new_job_exe() when the job requires more GPUs than the node has available

    The node offers a single GPU (with another task's resources also holding one GPU),
    so a job execution requiring 2 GPUs must be rejected. The previous docstring
    ("requires less GPUs than available") was a copy-paste from the partial-node test.
    """
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock()
    node.is_ready_for_new_job.return_value = True
    node.is_ready_for_next_job_task = MagicMock()
    node.is_ready_for_next_job_task.return_value = True
    offered_resources = NodeResources([Cpus(10.0), Mem(50.0), Gpus(1.0)])
    task_resources = NodeResources([Gpus(1.0)])  # Another task already claims a GPU
    watermark_resources = NodeResources([Cpus(100.0), Mem(500.0), Gpus(1.0)])
    resource_set = ResourceSet(offered_resources, task_resources, watermark_resources)
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    # Job requires 2 GPUs, more than the node can provide
    queue_model = queue_test_utils.create_queue(cpus_required=1.0, mem_required=10.0,
                                                disk_in_required=0.0, disk_out_required=0.0,
                                                disk_total_required=0.0, gpus_required=2)
    job_exe = QueuedJobExecution(queue_model)

    accepted = scheduling_node.accept_new_job_exe(job_exe)
    self.assertFalse(accepted)
def test_accept_new_job_exe_no_jobs(self):
    """Tests calling accept_new_job_exe() when new job exes are not allowed"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=False)  # Node refuses new jobs
    node.is_ready_for_next_job_task = MagicMock(return_value=True)

    offered = NodeResources([Cpus(10.0), Mem(50.0)])
    resource_set = ResourceSet(offered, NodeResources(), NodeResources([Cpus(100.0), Mem(500.0)]))
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    queue_model = queue_test_utils.create_queue(cpus_required=1.0, mem_required=10.0,
                                                disk_in_required=0.0, disk_out_required=0.0,
                                                disk_total_required=0.0)
    queued_exe = QueuedJobExecution(queue_model)

    self.assertFalse(scheduling_node.accept_new_job_exe(queued_exe))

    # Nothing allocated, resources untouched, and no node assigned to the exe
    self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(NodeResources([Cpus(10.0), Mem(50.0)])))
    self.assertIsNone(queued_exe._scheduled_node_id)
def test_max_resources(self):
    """Tests successfully calculating the max resources in a cluster"""
    offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                            NodeResources([Cpus(2.0), Mem(22048.0), Disk(1024.0)]), now(), None)
    offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
    offer_3 = ResourceOffer('offer_3', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(225.0), Mem(1024.0), Disk(22048.0)]), now(), None)
    resource_mgr.add_new_offers([offer_1, offer_2, offer_3])
    resource_mgr.refresh_agent_resources([], now())

    # Renamed from 'max' so the builtin is not shadowed.
    # agent_1 totals (2, 22048, 1024); agent_2 totals (250, 3072, 24096).
    # Max per resource across agents: cpus 250, mem 22048, disk 24096
    max_available = resource_mgr.get_max_available_resources()
    self.assertTrue(max_available.is_equal(
        NodeResources([Cpus(250.0), Mem(22048.0), Disk(24096.0)])))
def test_all_available_resources(self):
    """Tests successfully calculating the available resources in a cluster"""
    offers = [
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(22048.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
        ResourceOffer('offer_3', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(225.0), Mem(1024.0), Disk(22048.0)]), now(), None),
    ]
    resource_mgr.add_new_offers(offers)
    resource_mgr.refresh_agent_resources([], now())

    # The totals are the sums across all three offers
    expected = {'cpus': 252.0, 'mem': 25120.0, 'disk': 25120.0, 'gpus': 0.0}
    self.assertDictEqual(resource_mgr.get_all_available_resources(), expected)
def test_job_type_limit(self):
    """Tests calling perform_scheduling() with a job type limit"""
    Queue.objects.all().delete()

    # Job type that may only have 4 executions scheduled at once
    limited_job_type = job_test_utils.create_seed_job_type()
    limited_job_type.max_scheduled = 4
    limited_job_type.save()

    running_job_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_1.agent_id, job_type=limited_job_type, node=self.node_1)
    for _ in range(6):
        queue_test_utils.create_queue(job_type=limited_job_type)
    job_type_mgr.sync_with_database()

    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([running_job_exe], [])

    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())

    # One is already running, so only 3 more of the limit of 4 may be scheduled
    self.assertEqual(num_tasks, 3)
def test_start_job_exe_tasks(self):
    """Tests calling start_job_exe_tasks() successfully"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=True)
    node.is_ready_for_next_job_task = MagicMock(return_value=True)

    resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]), NodeResources(),
                               NodeResources([Cpus(200.0), Mem(700.0)]))
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    job_exe_1 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]))
    job_exe_2 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(5.0), Mem(25.0)]))
    scheduling_node.accept_job_exe_next_task(job_exe_1, [])
    scheduling_node.accept_job_exe_next_task(job_exe_2, [])
    self.assertEqual(len(scheduling_node._allocated_running_job_exes), 2)

    # Cancel job_exe_1 so it no longer has a next task to start
    job_exe_1.execution_canceled(now())

    scheduling_node.start_job_exe_tasks()

    self.assertEqual(len(scheduling_node._allocated_running_job_exes), 0)
    self.assertEqual(len(scheduling_node.allocated_tasks), 1)  # Only job_exe_2 had a next task
def test_paused_job_type(self):
    """Tests calling perform_scheduling() when a job type is paused"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    # Pause the first queued job's type and sync so the scheduler sees it
    self.queue_1.job_type.is_paused = True
    self.queue_1.job_type.save()
    job_type_mgr.sync_with_database()

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())
    self.assertEqual(num_tasks, 1)  # Only the job whose type is not paused is scheduled

    # queue_1 stays queued; queue_2 is scheduled and removed from the queue
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
def test_missing_job_types(self):
    """Tests calling perform_scheduling() when a queued job type has not been synced to the scheduler"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    manager = SchedulingManager()

    # Empty the scheduler's job type cache so queued job types cannot be found
    with patch('scheduler.scheduling.manager.job_type_mgr.get_job_types') as mock_get_job_types:
        mock_get_job_types.return_value = {}
        num_tasks = manager.perform_scheduling(self._client, now())

    # Nothing should be scheduled and both queue models should remain
    self.assertEqual(num_tasks, 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
def test_paused_scheduler(self):
    """Tests calling perform_scheduling() with a paused scheduler"""
    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    # Pause the scheduler and propagate the paused state to the nodes
    Scheduler.objects.update(is_paused=True)
    scheduler_mgr.sync_with_database()
    node_mgr.sync_with_database(scheduler_mgr.config)
    # Make sure system tasks don't get scheduled either
    system_task_mgr._is_db_update_completed = False

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())

    # Nothing is scheduled while paused; both queue models remain
    self.assertEqual(num_tasks, 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
def test_score_job_exe_for_reservation_insufficient_resources(self):
    """Tests calling score_job_exe_for_reservation() when there are not enough resources to reserve for the job"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock()
    node.is_ready_for_new_job.return_value = True
    node.is_ready_for_next_job_task = MagicMock()
    node.is_ready_for_next_job_task.return_value = True
    offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
    watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
    resource_set = ResourceSet(offered_resources, NodeResources(), watermark_resources)
    task = HealthTask('1234', 'agent_1')  # Resources are 0.1 CPUs and 32 MiB memory
    # Two running job exes with different priorities; only the higher-priority
    # (lower number) one counts against the reservation budget below
    job_exe_1 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id,
        resources=NodeResources([Cpus(10.0), Mem(50.0)]), priority=1000)
    job_exe_2 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id,
        resources=NodeResources([Cpus(56.0), Mem(15.0)]), priority=100)
    scheduling_node = SchedulingNode('agent_1', node, [task],
                                     [job_exe_1, job_exe_2], resource_set)
    queue_model_1 = queue_test_utils.create_queue(priority=100, cpus_required=8.0,
                                                  mem_required=40.0, disk_in_required=0.0,
                                                  disk_out_required=0.0,
                                                  disk_total_required=0.0)
    job_exe_1 = QueuedJobExecution(queue_model_1)
    queue_model_2 = queue_test_utils.create_queue(priority=1000, cpus_required=8.0,
                                                  mem_required=40.0, disk_in_required=0.0,
                                                  disk_out_required=0.0,
                                                  disk_total_required=0.0)
    job_exe_2 = QueuedJobExecution(queue_model_2)
    # Both queued job exes are accepted onto the node before reservation is scored
    scheduling_node.accept_new_job_exe(job_exe_1)
    scheduling_node.accept_new_job_exe(job_exe_2)

    # We are going to try to reserve the node for a job execution with priority 120
    # Calculate available resources for reservation:
    # Watermark (200, 700) - System Tasks (0.1, 32) - Higher Priority Existing Job Exes (56, 15) - Higher Priority
    # New Job Exes (8, 40) = 135.9 CPUs, 613 memory
    # This new job should NOT fit for reservation
    queue_model = queue_test_utils.create_queue(priority=120, cpus_required=140.0,
                                                mem_required=600.0, disk_in_required=0.0,
                                                disk_out_required=0.0,
                                                disk_total_required=0.0)
    job_exe = QueuedJobExecution(queue_model)
    job_type_resource_1 = NodeResources([Cpus(2.0), Mem(10.0)])
    # A None score means the node cannot be reserved for this job execution
    score = scheduling_node.score_job_exe_for_reservation(job_exe, [job_type_resource_1])
    self.assertIsNone(score)
def test_get_queued_resources(self):
    """Tests successfully getting queued resource information"""
    offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                            NodeResources([Cpus(2.0), Mem(22048.0), Disk(1024.0)]), now(), None)
    offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
    offer_3 = ResourceOffer('offer_3', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(225.0), Mem(1024.0), Disk(22048.0)]), now(), None)
    resource_mgr.add_new_offers([offer_1, offer_2, offer_3])
    resource_mgr.refresh_agent_resources([], now())

    # No cluster resources row exists until update_all_cluster_resources() runs
    resource_db = ClusterResources.objects.first()
    self.assertIsNone(resource_db)

    resource_mgr.update_all_cluster_resources()

    # Totals are the sums across all three offers
    resource_db = ClusterResources.objects.first()
    self.assertIsNotNone(resource_db)
    self.assertEqual(resource_db.mem, 25120.0)
    self.assertEqual(resource_db.gpus, 0.0)
    self.assertEqual(resource_db.disk, 25120.0)
    self.assertEqual(resource_db.cpus, 252.0)

    # Queued resource report: cluster totals, queue length per status, and the
    # total resources requested per status (presumably three jobs queued by the
    # test fixture at 1 CPU / 128 MiB each — TODO confirm against setUp)
    queued_resources = resource_mgr.get_queued_resources()
    self.assertDictEqual(queued_resources, {
        "cluster_resources": {
            'cpus': 252,
            'disk': 25120,
            'gpus': 0,
            'mem': 25120
        },
        "queue_lengths": {
            'PENDING': 0,
            'QUEUED': 3,
            'RUNNING': 0
        },
        "total_resources": {
            'PENDING': {},
            'QUEUED': {
                'cpus': 3.0,
                'mem': 384.0
            },
            'RUNNING': {}
        }
    })
def test_node_with_new_agent_id(self):
    """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
    # Host 2 gets new agent ID of agent_3
    node_mgr.lost_node(self.agent_2)
    node_mgr.register_agents([self.agent_3])
    node_mgr.sync_with_database(scheduler_mgr.config)

    resource_mgr.add_new_offers([
        ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])

    num_tasks = SchedulingManager().perform_scheduling(self._client, now())
    self.assertEqual(num_tasks, 2)  # Schedule both queued job executions

    # One client call checks for the driver, the second launches the tasks
    calls = self._client.method_calls
    self.assertEqual(2, len(calls))

    # Every launched task must carry the node's new agent ID
    mesos_tasks = calls[1][1][1]
    for mesos_task in mesos_tasks:
        self.assertEqual(self.agent_3.agent_id, mesos_task['agent_id']['value'])
def __init__(self, resources=None):
    """Constructor

    :param resources: The list of node resources
    :type resources: list

    :raises ScaleLogicBug: If a resource is not of SCALAR type
    """

    self._resources = {}  # {Name: Resource}

    if resources:
        for resource in resources:
            if resource.resource_type != 'SCALAR':
                # Format the message eagerly: passing the value as a second
                # constructor argument (logging-style) would leave the "%s"
                # placeholder unfilled in the raised exception
                raise ScaleLogicBug('Resource type "%s" is not currently supported'
                                    % resource.resource_type)
            self._resources[resource.name] = resource

    # Make sure standard resources are defined
    if 'cpus' not in self._resources:
        self._resources['cpus'] = Cpus(0.0)
    if 'mem' not in self._resources:
        self._resources['mem'] = Mem(0.0)
    if 'disk' not in self._resources:
        self._resources['disk'] = Disk(0.0)
    if 'gpus' not in self._resources:
        self._resources['gpus'] = Gpus(0.0)
def create_queue(job_type=None, priority=1, timeout=3600, cpus_required=1.0, mem_required=512.0,
                 disk_in_required=200.0, disk_out_required=100.0, disk_total_required=300.0,
                 gpus_required=0, queued=None):
    """Creates a queue model for unit testing

    :param job_type: The job type
    :type job_type: :class:`job.models.JobType`
    :param priority: The priority
    :type priority: int
    :param timeout: The timeout
    :type timeout: int
    :param cpus_required: The number of CPUs required
    :type cpus_required: float
    :param mem_required: The memory required in MiB
    :type mem_required: float
    :param disk_in_required: The input disk space required in MiB
    :type disk_in_required: float
    :param disk_out_required: The output disk space required in MiB
    :type disk_out_required: float
    :param disk_total_required: The total disk space required in MiB
    :type disk_total_required: float
    :param gpus_required: The number of GPUs required
    :type gpus_required: float
    :param queued: The time the execution was queued (defaults to the current time)
    :type queued: :class:`datetime.datetime`
    """

    if queued is None:
        # Evaluate at call time; the previous default of timezone.now() in the
        # signature was evaluated once at import time, so every queue model
        # created without an explicit time shared the same stale timestamp
        queued = timezone.now()

    job = job_test_utils.create_job(job_type=job_type, status='QUEUED')
    resources = NodeResources([Cpus(cpus_required), Mem(mem_required), Disk(disk_total_required),
                               Gpus(gpus_required)])

    return Queue.objects.create(job_type=job.job_type, job=job, exe_num=job.num_exes,
                                priority=priority, timeout=timeout, input_file_size=disk_in_required,
                                interface=job.get_job_interface().get_dict(),
                                configuration=ExecutionConfiguration().get_dict(),
                                resources=resources.get_json().get_dict(), queued=queued)
def setUp(self):
    """Sets up two agents with outstanding offers for the resource manager tests"""
    django.setup()

    resource_mgr.clear()

    self.agent_1 = Agent('agent_1', 'host_1')
    self.agent_2 = Agent('agent_2', 'host_2')
    self.framework_id = '1234'

    resource_mgr.add_new_offers([
        ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                      NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None),
        ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                      NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None),
    ])
    resource_mgr.refresh_agent_resources([], now())
def populate_queue_resources(apps, schema_editor):
    # Data migration: back-fills the new Queue.resources JSON column from the
    # legacy per-resource columns (cpus_required, mem_required, disk_total_required)
    from node.resources.node_resources import NodeResources
    from node.resources.resource import Cpus, Disk, Mem

    # Go through all of the queue models and populate their new resources columns
    Queue = apps.get_model('queue', 'Queue')
    total_count = Queue.objects.all().count()
    print 'Populating new resources field for %s queue models' % str(total_count)
    done_count = 0
    batch_size = 1000
    while done_count < total_count:
        percent = (float(done_count) / float(total_count)) * 100.00
        print 'Completed %s of %s queue models (%f%%)' % (done_count, total_count, percent)
        batch_end = done_count + batch_size
        # Stable ordering so consecutive slices yield disjoint batches
        for queue in Queue.objects.order_by('job_exe_id')[done_count:batch_end]:
            cpus = queue.cpus_required
            mem = queue.mem_required
            disk = queue.disk_total_required
            resources = NodeResources([Cpus(cpus), Mem(mem), Disk(disk)])
            queue.resources = resources.get_json().get_dict()
            queue.save()
        done_count += batch_size
    print 'All %s queue models completed' % str(total_count)
def job_get_resources(self):
    """Returns the resources required for this job

    :returns: The required resources
    :rtype: :class:`node.resources.node_resources.NodeResources`
    """

    # Start from the job type's base resource requirements
    resources = self.job_type.get_resources()

    # Calculate memory required in MiB rounded up to the nearest whole MiB
    multiplier = self.job_type.mem_mult_required
    const = self.job_type.mem_const_required
    disk_in_required = self.disk_in_required
    if not disk_in_required:
        # Treat a missing/zero input size as 0 MiB for the formulas below
        disk_in_required = 0.0
    memory_mb = long(math.ceil(multiplier * disk_in_required + const))
    memory_required = max(memory_mb, MIN_MEM)

    # Calculate output space required in MiB rounded up to the nearest whole MiB
    multiplier = self.job_type.disk_out_mult_required
    const = self.job_type.disk_out_const_required
    output_size_mb = long(math.ceil(multiplier * disk_in_required + const))
    disk_out_required = max(output_size_mb, MIN_DISK)

    # Total disk requirement is the output space plus the input file size
    resources.add(NodeResources([Mem(memory_required),
                                 Disk(disk_out_required + disk_in_required)]))
    return resources
def test_reset_new_job_exes(self):
    """Tests calling reset_new_job_exes() successfully"""
    node = MagicMock()
    node.hostname = 'host_1'
    node.id = 1
    node.is_ready_for_new_job = MagicMock(return_value=True)
    node.is_ready_for_next_job_task = MagicMock(return_value=True)

    offered_resources = NodeResources([Cpus(100.0), Mem(500.0)])
    watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
    resource_set = ResourceSet(offered_resources, NodeResources(), watermark_resources)
    scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

    exe_1 = QueuedJobExecution(queue_test_utils.create_queue(
        cpus_required=2.0, mem_required=60.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))
    exe_2 = QueuedJobExecution(queue_test_utils.create_queue(
        cpus_required=4.5, mem_required=400.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))

    expected_allocation = NodeResources()
    expected_allocation.add(exe_1.required_resources)
    expected_allocation.add(exe_2.required_resources)

    # Accept both queued job exes on the node
    scheduling_node.accept_new_job_exe(exe_1)
    scheduling_node.accept_new_job_exe(exe_2)
    self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 2)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(expected_allocation))

    # Resetting the queued job exes returns the node to its initial state
    scheduling_node.reset_new_job_exes()
    self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(offered_resources))
def test_no_default_workspace(self, mock_taskinfo):
    """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler

    :param mock_taskinfo: Mock injected by a @patch decorator on this test (decorator
        is outside this view — TODO confirm the patched target)
    """
    mock_taskinfo.return_value = MagicMock()

    offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                            NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now())
    offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
    resource_mgr.add_new_offers([offer_1, offer_2])

    # Add output data to the first queued job:
    # output data + no workspace defined = fail
    queue_1 = Queue.objects.get(id=self.queue_1.id)
    queue_1.get_job_interface().definition['output_data'] = [{'name': 'my_output', 'type': 'file'}]
    config = queue_1.get_execution_configuration()
    queue_1.configuration = config.get_dict()
    queue_1.save()

    # No output data + no workspace = pass
    queue_2 = Queue.objects.get(id=self.queue_2.id)
    config = queue_2.get_execution_configuration()
    queue_2.configuration = config.get_dict()
    queue_2.save()

    scheduling_manager = SchedulingManager()

    # Set a workspace on the manager
    with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
        mock_get_workspaces.return_value = {
            'name': 'my_workspace',
            'title': 'My Workspace',
            'description': 'My workspaces',
            'is_active': True,
            'json_config': {'version': '1.0', 'broker': {'type': 'host', 'host_path': '/host/path'}},
        }
        num_tasks = scheduling_manager.perform_scheduling(self._driver, now())

    # Only queue_2 should be scheduled: queue_1 declares output data but has no
    # output workspace configured, so it cannot be scheduled
    self.assertEqual(num_tasks, 1)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
    self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
    self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
def setUp(self):
    """Sets up scheduler, node, and queue state for each test.

    NOTE(review): the manager calls below (sync/clear/register) are order-dependent;
    do not reorder them.
    """
    django.setup()

    reset_error_cache()

    self.framework_id = '1234'
    Scheduler.objects.initialize_scheduler()
    Scheduler.objects.update(
        num_message_handlers=0
    )  # Prevent message handler tasks from scheduling
    # Mock scheduler client passed into perform_scheduling() by the tests
    self._client = MagicMock()
    scheduler_mgr.sync_with_database()
    scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
    resource_mgr.clear()
    job_exe_mgr.clear()

    self.agent_1 = Agent('agent_1', 'host_1')
    self.agent_2 = Agent('agent_2', 'host_2')
    # agent_3 is created but deliberately NOT registered with node_mgr below
    self.agent_3 = Agent('agent_3', 'host_2')
    node_mgr.clear()
    node_mgr.register_agents([self.agent_1, self.agent_2])
    node_mgr.sync_with_database(scheduler_mgr.config)
    # Ignore initial cleanup, health check, and image pull tasks
    for node in node_mgr.get_nodes():
        node._last_health_task = now()
        node._initial_cleanup_completed()
        node._is_image_pulled = True
        node._update_state()
        if node.agent_id == 'agent_1':
            self.node_1_id = node.id
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    self.node_1 = Node.objects.get(id=self.node_1_id)
    # Ignore system tasks
    system_task_mgr._is_db_update_completed = True

    # Two schedulable queued jobs plus one too large for any single offer
    self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0, disk_in_required=100.0,
                                                 disk_out_required=200.0, disk_total_required=300.0)
    self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                 disk_out_required=45.0, disk_total_required=445.0)
    self.queue_large = queue_test_utils.create_queue(
        resources=NodeResources([Cpus(
            125.0), Mem(12048.0), Disk(12048.0)]))

    job_type_mgr.sync_with_database()
def test_accept_new_job_exe_gpu_partial_node(self):
    """Tests successfully calling accept_new_job_exe() when job requires less GPUs than available"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock()
    mock_node.is_ready_for_new_job.return_value = True
    mock_node.is_ready_for_next_job_task = MagicMock()
    mock_node.is_ready_for_next_job_task.return_value = True

    offered = NodeResources([Cpus(10.0), Mem(50.0), Gpus(4.0)])
    watermark = NodeResources([Cpus(100.0), Mem(500.0), Gpus(4.0)])
    resources = ResourceSet(offered, NodeResources(), watermark)
    scheduling_node = SchedulingNode('agent_1', mock_node, [], [], resources)

    # Job only asks for 1 of the node's 4 GPUs
    queue = queue_test_utils.create_queue(cpus_required=1.0, mem_required=10.0, disk_in_required=0.0,
                                          disk_out_required=0.0, disk_total_required=0.0, gpus_required=1)
    job_exe = QueuedJobExecution(queue)

    self.assertTrue(scheduling_node.accept_new_job_exe(job_exe))
    self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 1)
    # Verify that our greedy GPU allocation logic is working: all 4 GPUs are
    # allocated even though the job only required 1
    self.assertTrue(scheduling_node.allocated_resources.is_equal(
        NodeResources([Cpus(1.0), Mem(10.0), Gpus(4.0)])))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(
        NodeResources([Cpus(9.0), Mem(40.0)])))
    self.assertEqual(job_exe._scheduled_node_id, mock_node.id)
def test_accept_node_tasks(self):
    """Tests successfully calling accept_node_tasks()"""

    health_task = HealthTask('1234', 'agent_1')
    pull_task = PullTask('1234', 'agent_1')

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock()
    mock_node.is_ready_for_new_job.return_value = True
    mock_node.is_ready_for_next_job_task = MagicMock()
    mock_node.is_ready_for_next_job_task.return_value = True
    mock_node.get_next_tasks = MagicMock()
    mock_node.get_next_tasks.return_value = [health_task, pull_task]

    # Combined resources that the two node tasks will consume
    task_totals = NodeResources()
    task_totals.add(health_task.get_resources())
    task_totals.add(pull_task.get_resources())

    offered = NodeResources([Cpus(100.0), Mem(5000.0)])
    watermark = NodeResources([Cpus(100.0), Mem(5000.0)])
    expected_remaining = NodeResources()
    expected_remaining.add(offered)
    expected_remaining.subtract(task_totals)
    resources = ResourceSet(offered, NodeResources(), watermark)
    scheduling_node = SchedulingNode('agent_1', mock_node, [], [], resources)

    waiting_tasks = []
    had_waiting_task = scheduling_node.accept_node_tasks(now(), waiting_tasks)

    self.assertFalse(had_waiting_task)
    self.assertEqual(len(scheduling_node.allocated_tasks), 2)
    self.assertTrue(scheduling_node.allocated_resources.is_equal(task_totals))
    self.assertTrue(scheduling_node._remaining_resources.is_equal(expected_remaining))
    self.assertListEqual(waiting_tasks, [])