Example #1
0
    def test_score_job_exe_for_reservation_insufficient_resources(self):
        """Tests calling score_job_exe_for_reservation() when there are not enough resources to reserve for the job"""

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        task = HealthTask(
            '1234', 'agent_1')  # Resources are 0.1 CPUs and 32 MiB memory
        # Two running job exes: priority 1000 (lower priority than the job being
        # reserved) and priority 100 (higher priority, so its resources are held back)
        job_exe_1 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]),
            priority=1000)
        job_exe_2 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(56.0), Mem(15.0)]),
            priority=100)
        scheduling_node = SchedulingNode('agent_1', node, [task],
                                         [job_exe_1, job_exe_2], resource_set)
        # Two queued job exes accepted on the node: priority 100 (higher than the
        # job being reserved) and priority 1000 (lower)
        queue_model_1 = queue_test_utils.create_queue(priority=100,
                                                      cpus_required=8.0,
                                                      mem_required=40.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_1 = QueuedJobExecution(queue_model_1)
        queue_model_2 = queue_test_utils.create_queue(priority=1000,
                                                      cpus_required=8.0,
                                                      mem_required=40.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_2 = QueuedJobExecution(queue_model_2)
        scheduling_node.accept_new_job_exe(job_exe_1)
        scheduling_node.accept_new_job_exe(job_exe_2)

        # We are going to try to reserve the node for a job execution with priority 120
        # Calculate available resources for reservation:
        # Watermark (200, 700) - System Tasks (0.1, 32) - Higher Priority Existing Job Exes (56, 15) -  Higher Priority
        # New Job Exes (8, 40) = 135.9 CPUs, 613 memory
        # This new job should NOT fit for reservation
        queue_model = queue_test_utils.create_queue(priority=120,
                                                    cpus_required=140.0,
                                                    mem_required=600.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)
        job_type_resource_1 = NodeResources([Cpus(2.0), Mem(10.0)])

        # 140 CPUs required > 135.9 available, so no score (None) is returned
        score = scheduling_node.score_job_exe_for_reservation(
            job_exe, [job_type_resource_1])
        self.assertIsNone(score)
Example #2
0
    def test_consider_new_job_exe(self):
        """Tests consider_new_job_exe() and get_accepted_new_job_exes()"""

        offers = NodeOffers(self.node)
        offers.add_offer(
            ResourceOffer('offer_1', self.node_agent,
                          NodeResources(cpus=24.0, mem=1024.0, disk=1024.0)))
        offers.add_offer(
            ResourceOffer('offer_2', self.node_agent,
                          NodeResources(cpus=50.0, mem=2048.0, disk=2048.0)))
        # Nothing has been accepted yet
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        first_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(first_exe),
                         NodeOffers.ACCEPTED)
        # Considering the same job execution twice should have no effect
        self.assertEqual(offers.consider_new_job_exe(first_exe),
                         NodeOffers.ACCEPTED)

        # Each over-sized job execution is rejected with the matching reason
        high_cpus_exe = QueuedJobExecution(self.queue_high_cpus)
        self.assertEqual(offers.consider_new_job_exe(high_cpus_exe),
                         NodeOffers.NOT_ENOUGH_CPUS)

        high_mem_exe = QueuedJobExecution(self.queue_high_mem)
        self.assertEqual(offers.consider_new_job_exe(high_mem_exe),
                         NodeOffers.NOT_ENOUGH_MEM)

        high_disk_exe = QueuedJobExecution(self.queue_high_disk)
        self.assertEqual(offers.consider_new_job_exe(high_disk_exe),
                         NodeOffers.NOT_ENOUGH_DISK)

        second_exe = QueuedJobExecution(self.queue_2)
        self.assertEqual(offers.consider_new_job_exe(second_exe),
                         NodeOffers.ACCEPTED)

        # Exactly the two accepted new job exes are tracked, no running ones
        self.assertTrue(offers.has_accepted_job_exes())
        self.assertEqual(len(offers.get_accepted_new_job_exes()), 2)
        self.assertSetEqual(set(offers.get_accepted_new_job_exes()),
                            {first_exe, second_exe})
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])

        # Remaining resources after the two accepted job executions
        self.assertEqual(offers._available_cpus, 64.0)
        self.assertEqual(offers._available_mem, 1536.0)
        self.assertEqual(offers._available_disk, 2327.0)
Example #3
0
    def test_accept_new_job_exe_gpu_partial_node_other_task(self):
        """Tests calling accept_new_job_exe() when the job requires more GPUs than are available

        The node has only 1 GPU and it is already claimed by task resources, while the
        job requires 2 GPUs, so the job execution must be rejected.
        """

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        # Only 1 GPU offered, and an existing task is using 1 GPU
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0), Gpus(1.0)])
        task_resources = NodeResources([Gpus(1.0)])
        watermark_resources = NodeResources(
            [Cpus(100.0), Mem(500.0), Gpus(1.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        # Job requires 2 GPUs, more than the node can provide
        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0,
                                                    gpus_required=2)
        job_exe = QueuedJobExecution(queue_model)

        accepted = scheduling_node.accept_new_job_exe(job_exe)
        self.assertFalse(accepted)
Example #4
0
    def test_lost_node(self):
        """Tests when the node is lost"""

        offers = NodeOffers(self.node)
        offers.add_offer(
            ResourceOffer('offer_1', self.node_agent,
                          NodeResources(cpus=24.0, mem=1024.0, disk=1024.0)))
        offers.add_offer(
            ResourceOffer('offer_2', self.node_agent,
                          NodeResources(cpus=50.0, mem=2048.0, disk=2048.0)))
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # Accept one running and one queued job execution
        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(running_exe),
                         NodeOffers.ACCEPTED)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe),
                         NodeOffers.ACCEPTED)

        self.assertTrue(offers.has_accepted_job_exes())
        self.assertGreater(offers._available_cpus, 0.0)
        self.assertGreater(offers._available_mem, 0.0)
        self.assertGreater(offers._available_disk, 0.0)

        # Losing the node drops accepted job exes and zeroes out resources
        offers.lost_node()
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertEqual(offers._available_cpus, 0.0)
        self.assertEqual(offers._available_mem, 0.0)
        self.assertEqual(offers._available_disk, 0.0)
Example #5
0
    def test_remove_offer(self):
        """Tests remove_offer()"""

        offers = NodeOffers(self.node)
        first_offer = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        offers.add_offer(first_offer)
        second_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        offers.add_offer(second_offer)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe),
                         NodeOffers.ACCEPTED)

        # Remove one offer, new job execution should still be accepted
        offers.remove_offer(first_offer.id)
        self.assertTrue(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_new_job_exes(), [queued_exe])

        # Remove second offer, no resources left, all job executions should be removed
        offers.remove_offer(second_offer.id)
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        self.assertEqual(offers._available_cpus, 0.0)
        self.assertEqual(offers._available_mem, 0.0)
        self.assertEqual(offers._available_disk, 0.0)
Example #6
0
    def test_score_job_exe_for_scheduling_insufficient_resources(self):
        """Tests calling score_job_exe_for_scheduling() when there are not enough resources to schedule the job"""

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        task_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        # Allocate 10 CPUs and 50 MiB memory to existing job execution
        job_exe = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]))
        scheduling_node.accept_job_exe_next_task(job_exe, [])

        # Should have 10 CPUs and 50 MiB memory left, so this job execution is too big
        queue_model = queue_test_utils.create_queue(cpus_required=15.0,
                                                    mem_required=40.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)

        # 15 CPUs required > 10 remaining, so no score (None) is returned
        score = scheduling_node.score_job_exe_for_scheduling(job_exe, [])
        self.assertIsNone(score)
Example #7
0
    def _consider_new_job_exes(self):
        """Considers any queued job executions for scheduling
        """

        # Nothing to do while the scheduler is paused
        if self._scheduler_manager.is_paused():
            return

        accepted_count = 0
        for queue in Queue.objects.get_queue():
            job_type_id = queue.job_type_id

            # Skip job types that are unknown or paused
            if job_type_id not in self._job_types:
                continue
            if self._job_types[job_type_id].is_paused:
                continue

            # Skip job types whose scheduling limit has been exhausted
            if job_type_id in self._job_type_limit_available:
                if self._job_type_limit_available[job_type_id] < 1:
                    continue

            result = self._offer_manager.consider_new_job_exe(
                QueuedJobExecution(queue))
            if result != OfferManager.ACCEPTED:
                continue

            accepted_count += 1
            if job_type_id in self._job_type_limit_available:
                self._job_type_limit_available[job_type_id] -= 1
            # Stop once the per-pass cap on new job executions is reached
            if accepted_count >= SchedulingThread.MAX_NEW_JOB_EXES:
                break
Example #8
0
    def test_lost_node(self):
        """Tests accepting a running and queued job execution and then the node being lost"""

        small_offer = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        large_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        manager = OfferManager()
        manager.add_new_offers([small_offer, large_offer])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        # Accept one queued and one running job execution
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.ACCEPTED)

        running_exe = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(manager.consider_next_task(running_exe),
                         OfferManager.ACCEPTED)

        # After the node is lost, no offers with accepted job exes remain
        manager.lost_node(self.node_agent)
        self.assertEqual(len(manager.pop_offers_with_accepted_job_exes()), 0)
Example #9
0
    def test_accept_new_job_exe_no_jobs(self):
        """Tests calling accept_new_job_exe() when new job exes are not allowed"""

        # Mock a node that accepts next tasks but NOT new jobs
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = False
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0)])
        task_resources = NodeResources()
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        # Job would fit resource-wise, but the node refuses new jobs
        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)

        # Rejected: nothing allocated, offered resources untouched, job unscheduled
        accepted = scheduling_node.accept_new_job_exe(job_exe)
        self.assertFalse(accepted)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(
                NodeResources([Cpus(10.0), Mem(50.0)])))
        self.assertIsNone(job_exe._scheduled_node_id)
Example #10
0
    def test_paused_node(self):
        """Tests adding job executions when the node is paused"""

        offers = NodeOffers(self.paused_node)
        offers.add_offer(
            ResourceOffer('offer_1', self.node_agent_paused,
                          NodeResources(cpus=24.0, mem=1024.0, disk=1024.0)))
        offers.add_offer(
            ResourceOffer('offer_2', self.node_agent_paused,
                          NodeResources(cpus=50.0, mem=2048.0, disk=2048.0)))
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # Ensure it accepts new tasks for already running job executions
        running_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(running_exe_1),
                         NodeOffers.ACCEPTED)

        running_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(offers.consider_next_task(running_exe_2),
                         NodeOffers.ACCEPTED)

        # Don't accept new job executions while paused
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe),
                         NodeOffers.NODE_NOT_READY)

        # Only the two running job exes were accepted
        self.assertTrue(offers.has_accepted_job_exes())
        self.assertEqual(len(offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(offers.get_accepted_running_job_exes()),
                            {running_exe_1, running_exe_2})
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        self.assertEqual(offers._available_cpus, 68.0)
        self.assertEqual(offers._available_mem, 1536.0)
        self.assertEqual(offers._available_disk, 2222.0)
Example #11
0
    def test_reset_new_job_exes(self):
        """Tests calling reset_new_job_exes() successfully"""

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        queue_model_1 = queue_test_utils.create_queue(cpus_required=2.0,
                                                      mem_required=60.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_1 = QueuedJobExecution(queue_model_1)
        queue_model_2 = queue_test_utils.create_queue(cpus_required=4.5,
                                                      mem_required=400.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_2 = QueuedJobExecution(queue_model_2)
        # Expected allocation is the sum of both job exes' required resources
        allocated_resources = NodeResources()
        allocated_resources.add(job_exe_1.required_resources)
        allocated_resources.add(job_exe_2.required_resources)

        # Set up node with queued job exes
        scheduling_node.accept_new_job_exe(job_exe_1)
        scheduling_node.accept_new_job_exe(job_exe_2)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 2)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(allocated_resources))

        # Reset queued job exes and check that everything is back to square one
        scheduling_node.reset_new_job_exes()
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(offered_resources))
Example #12
0
    def test_score_job_exe_for_scheduling(self):
        """Tests calling score_job_exe_for_scheduling() successfully"""

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        task_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        # Allocate 10 CPUs and 50 MiB memory to existing job execution
        job_exe = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]))
        scheduling_node.accept_job_exe_next_task(job_exe, [])

        # Should have 10 CPUs and 50 MiB memory left, so this should be scheduled
        queue_model = queue_test_utils.create_queue(cpus_required=5.0,
                                                    mem_required=40.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)
        # Expected available 85 CPUs and 110 MiB memory "left" on node
        # (watermark - current tasks - allocated - new job we are scoring)
        # i.e. (200 - 100 - 10 - 5, 700 - 500 - 50 - 40) = (85, 110)
        # First 2 job types should fit, next 2 are too big, so score should be 2
        job_type_resource_1 = NodeResources([Cpus(2.0), Mem(10.0)])
        job_type_resource_2 = NodeResources([Cpus(85.0), Mem(109.0)])
        job_type_resource_3 = NodeResources([Cpus(86.0), Mem(10.0)])  # too many CPUs
        job_type_resource_4 = NodeResources([Cpus(2.0), Mem(111.0)])  # too much memory

        score = scheduling_node.score_job_exe_for_scheduling(
            job_exe, [
                job_type_resource_1, job_type_resource_2, job_type_resource_3,
                job_type_resource_4
            ])
        self.assertEqual(score, 2)
Example #13
0
    def test_no_offers(self):
        """Tests adding job executions when there are no offers"""

        offers = NodeOffers(self.node)
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # Without any offers, both running and queued job exes are refused
        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(running_exe),
                         NodeOffers.NO_OFFERS)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe),
                         NodeOffers.NO_OFFERS)

        # Still nothing accepted
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])
Example #14
0
    def test_successful(self):
        """Tests calling QueueManager.schedule_job_executions() successfully."""

        # Mark both queued job exes as accepted on the node with their full
        # required resources
        queued_job_exe_1 = QueuedJobExecution(self.queue_1)
        queued_job_exe_1.accepted(
            self.node.id,
            JobResources(cpus=self.queue_1.cpus_required,
                         mem=self.queue_1.mem_required,
                         disk_in=self.queue_1.disk_in_required,
                         disk_out=self.queue_1.disk_out_required,
                         disk_total=self.queue_1.disk_total_required))
        queued_job_exe_2 = QueuedJobExecution(self.queue_2)
        queued_job_exe_2.accepted(
            self.node.id,
            JobResources(cpus=self.queue_2.cpus_required,
                         mem=self.queue_2.mem_required,
                         disk_in=self.queue_2.disk_in_required,
                         disk_out=self.queue_2.disk_out_required,
                         disk_total=self.queue_2.disk_total_required))

        scheduled_job_exes = Queue.objects.schedule_job_executions(
            'framework-123', [queued_job_exe_1, queued_job_exe_2], {})
        # Only job_exe_1 should have scheduled since job_exe_2 has an invalid job type configuration JSON
        self.assertEqual(len(scheduled_job_exes), 1)
        self.assertEqual(scheduled_job_exes[0].id, self.job_exe_1.id)
        # job_exe_1 should be RUNNING (and removed from the queue)
        self.assertEqual(
            Queue.objects.filter(job_exe_id=self.job_exe_1.id).count(), 0)
        self.assertEqual(
            JobExecution.objects.get(id=self.job_exe_1.id).status, 'RUNNING')
        self.assertEqual(
            Job.objects.get(id=self.job_exe_1.job_id).status, 'RUNNING')
        # job_exe_2 should be QUEUED (and still on the queue)
        self.assertEqual(
            Queue.objects.filter(job_exe_id=self.job_exe_2.id).count(), 1)
        self.assertEqual(
            JobExecution.objects.get(id=self.job_exe_2.id).status, 'QUEUED')
        self.assertEqual(
            Job.objects.get(id=self.job_exe_2.job_id).status, 'QUEUED')
Example #15
0
    def test_no_ready_offers(self):
        """Tests considering job executions when no offers are ready"""

        paused_offer = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        normal_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # Offers are added but never readied (no update_nodes()/ready_new_offers())
        manager = OfferManager()
        manager.add_new_offers([paused_offer, normal_offer])

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.NO_NODES_AVAILABLE)

        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(running_exe),
                         OfferManager.NODE_OFFLINE)
Example #16
0
    def test_high_disk(self):
        """Tests rejecting a queued job execution due to too much disk required"""

        paused_offer = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        normal_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        manager = OfferManager()
        manager.add_new_offers([paused_offer, normal_offer])
        manager.update_nodes([self.node, self.paused_node])
        manager.ready_new_offers()

        # Job exe needs more disk than any node can offer
        queued_exe = QueuedJobExecution(self.queue_high_disk)
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.NOT_ENOUGH_DISK)

        # Nothing was accepted, so no offers should be popped
        self.assertEqual(len(manager.pop_offers_with_accepted_job_exes()), 0)
Example #17
0
    def test_all_offers_paused(self):
        """Tests rejecting a queued job execution due to all nodes being paused"""

        first_offer = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        second_offer = ResourceOffer(
            'offer_2', self.node_agent_paused,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # Every offer belongs to the paused node
        manager = OfferManager()
        manager.add_new_offers([first_offer, second_offer])
        manager.update_nodes([self.paused_node])
        manager.ready_new_offers()

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.NO_NODES_AVAILABLE)

        # Nothing was accepted, so no offers should be popped
        self.assertEqual(len(manager.pop_offers_with_accepted_job_exes()), 0)
Example #18
0
    def test_offers_with_no_nodes(self):
        """Tests considering job executions when offers cannot be readied due to no nodes updated"""

        paused_offer = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        normal_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # ready_new_offers() is called without any update_nodes() first
        manager = OfferManager()
        manager.add_new_offers([paused_offer, normal_offer])
        manager.ready_new_offers()

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.NO_NODES_AVAILABLE)

        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(running_exe),
                         OfferManager.NODE_NOT_READY)
Example #19
0
    def test_lost_node_that_comes_back(self):
        """Tests that when a lost name comes back, it can schedule tasks again"""

        first_offer = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        second_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        manager = OfferManager()
        manager.add_new_offers([first_offer, second_offer])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # Node goes down and comes back up with new agent ID
        manager.lost_node(self.node_agent)
        new_node_agent = 'i_am_a_new_node_agent'
        self.node.update_from_mesos(agent_id=new_node_agent)

        queued_exe = QueuedJobExecution(self.queue_1)

        # Offers for previous agent should be gone, do not schedule the job exe
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.NO_NODES_AVAILABLE)

        replacement_offer = ResourceOffer(
            'offer_3', new_node_agent,
            NodeResources(cpus=35.0, mem=3048.0, disk=3048.0))
        manager.add_new_offers([replacement_offer])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # New offers have come in for new agent ID, should schedule job exe now
        self.assertEqual(manager.consider_new_job_exe(queued_exe),
                         OfferManager.ACCEPTED)
        self.assertEqual(len(manager.pop_offers_with_accepted_job_exes()), 1)
Example #20
0
    def test_accept_new_job_exe_gpu_partial_node(self):
        """Tests successfully calling accept_new_job_exe() when job requires less GPUs than available"""

        # Mock a node that is ready for both new jobs and next job tasks
        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered = NodeResources([Cpus(10.0), Mem(50.0), Gpus(4.0)])
        watermark = NodeResources([Cpus(100.0), Mem(500.0), Gpus(4.0)])
        resource_set = ResourceSet(offered, NodeResources(), watermark)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        # Job requires only 1 of the 4 available GPUs
        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0,
                                                    gpus_required=1)
        job_exe = QueuedJobExecution(queue_model)

        self.assertTrue(scheduling_node.accept_new_job_exe(job_exe))
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 1)
        # Verify that our greedy GPU allocation logic is working: all 4 GPUs are
        # allocated even though only 1 was required
        expected_allocated = NodeResources([Cpus(1.0), Mem(10.0), Gpus(4.0)])
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(expected_allocated))
        expected_remaining = NodeResources([Cpus(9.0), Mem(40.0)])
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(expected_remaining))
        self.assertEqual(job_exe._scheduled_node_id, node.id)
Example #21
0
    def _process_queue(self, nodes, job_types, job_type_limits,
                       job_type_resources, workspaces):
        """Retrieves the top of the queue and schedules new job executions on available nodes as resources and limits
        allow

        :param nodes: The dict of scheduling nodes stored by node ID for all nodes ready to accept new job executions
        :type nodes: dict
        :param job_types: The dict of job type models stored by job type ID
        :type job_types: dict
        :param job_type_limits: The dict of job type IDs mapping to job type limits
        :type job_type_limits: dict
        :param job_type_resources: The list of all of the job type resource requirements
        :type job_type_resources: list
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: dict
        :returns: The list of queued job executions that were scheduled
        :rtype: list
        """

        scheduled = []
        ignore_ids = self._calculate_job_types_to_ignore(job_types,
                                                         job_type_limits)
        started = now()

        top_of_queue = Queue.objects.get_queue(
            scheduler_mgr.config.queue_mode, ignore_ids)[:QUEUE_LIMIT]
        for queue_model in top_of_queue:
            job_exe = QueuedJobExecution(queue_model)

            # Canceled executions still count as "scheduled" so that
            # downstream processing handles them
            if job_exe.is_canceled:
                scheduled.append(job_exe)
                continue

            # No nodes left to place work on - stop scanning the queue
            if not nodes:
                break

            # Skip executions whose job type has not been synced to the scheduler yet
            job_type_id = queue_model.job_type_id
            if job_type_id not in job_types:
                continue

            # Skip executions referencing a workspace the scheduler does not know about
            names = job_exe.configuration.get_input_workspace_names()
            names.extend(job_exe.configuration.get_output_workspace_names())
            if any(name not in workspaces for name in names):
                continue

            # Enforce the per-job-type scheduling limit (absent = unlimited)
            if job_type_limits.get(job_type_id, 1) < 1:
                continue

            # Attempt placement; on success, charge it against the type's limit
            if self._schedule_new_job_exe(job_exe, nodes, job_type_resources):
                scheduled.append(job_exe)
                if job_type_id in job_type_limits:
                    job_type_limits[job_type_id] -= 1

        duration = now() - started
        log = logger.warning if duration > PROCESS_QUEUE_WARN_THRESHOLD else logger.debug
        log('Processing queue took %.3f seconds', duration.total_seconds())

        return scheduled
Example #22
0
    def test_successful_supersede(self):
        """Tests calling QueueManager.queue_new_recipe() successfully when superseding a recipe."""

        # Queue initial recipe and complete its first job
        node = node_test_utils.create_node()
        recipe_id = Queue.objects.queue_new_recipe(self.recipe_type, self.data, self.event)
        recipe = Recipe.objects.get(id=recipe_id)
        recipe_job_1 = RecipeJob.objects.select_related('job__job_exe').get(recipe_id=recipe_id, job_name='Job 1')
        job_exe_1 = JobExecution.objects.get(job_id=recipe_job_1.job_id)
        queued_job_exe = QueuedJobExecution(Queue.objects.get(job_exe_id=job_exe_1.id))
        # Accept the execution onto a node and schedule it so it can be completed below
        queued_job_exe.accepted(node, JobResources(cpus=10, mem=1000, disk_in=1000, disk_out=1000, disk_total=2000))
        Queue.objects.schedule_job_executions('123', [queued_job_exe], {})
        # Attach a product to Job 1's output so Job 2's input connection can be satisfied
        results = JobResults()
        results.add_file_list_parameter('Test Output 1', [product_test_utils.create_product().file_id])
        JobExecution.objects.filter(id=job_exe_1.id).update(results=results.get_dict())
        Queue.objects.handle_job_completion(job_exe_1.id, now())

        # Create a new recipe type that has a new version of job 2 (job 1 is identical)
        new_job_type_2 = job_test_utils.create_job_type(name=self.job_type_2.name, version='New Version',
                                                        interface=self.job_type_2.interface)
        new_definition = {
            'version': '1.0',
            'input_data': [{
                'name': 'Recipe Input',
                'type': 'file',
                'media_types': ['text/plain'],
            }],
            'jobs': [{
                'name': 'New Job 1',
                'job_type': {
                    'name': self.job_type_1.name,
                    'version': self.job_type_1.version,
                },
                'recipe_inputs': [{
                    'recipe_input': 'Recipe Input',
                    'job_input': 'Test Input 1',
                }]
            }, {
                'name': 'New Job 2',
                'job_type': {
                    'name': new_job_type_2.name,
                    'version': new_job_type_2.version,
                },
                'dependencies': [{
                    'name': 'New Job 1',
                    'connections': [{
                        'output': 'Test Output 1',
                        'input': 'Test Input 2',
                    }]
                }]
            }]
        }
        new_recipe_type = recipe_test_utils.create_recipe_type(name=self.recipe_type.name, definition=new_definition)
        event = trigger_test_utils.create_trigger_event()
        recipe_job_1 = RecipeJob.objects.select_related('job').get(recipe_id=recipe_id, job_name='Job 1')
        recipe_job_2 = RecipeJob.objects.select_related('job').get(recipe_id=recipe_id, job_name='Job 2')
        superseded_jobs = {'Job 1': recipe_job_1.job, 'Job 2': recipe_job_2.job}
        # Graph delta maps old jobs to their new counterparts so identical jobs can be copied forward
        graph_a = self.recipe_type.get_recipe_definition().get_graph()
        graph_b = new_recipe_type.get_recipe_definition().get_graph()
        delta = RecipeGraphDelta(graph_a, graph_b)

        # Queue new recipe that supersedes the old recipe
        new_recipe_id = Queue.objects.queue_new_recipe(new_recipe_type, None, event, recipe, delta, superseded_jobs)

        # Ensure old recipe is superseded
        recipe = Recipe.objects.get(id=recipe_id)
        self.assertTrue(recipe.is_superseded)

        # Ensure new recipe supersedes old recipe
        new_recipe = Recipe.objects.get(id=new_recipe_id)
        self.assertEqual(new_recipe.superseded_recipe_id, recipe_id)

        # Ensure that job 1 is already completed (it was copied from original recipe) and that job 2 is queued
        new_recipe_job_1 = RecipeJob.objects.select_related('job').get(recipe_id=new_recipe_id, job_name='New Job 1')
        new_recipe_job_2 = RecipeJob.objects.select_related('job').get(recipe_id=new_recipe_id, job_name='New Job 2')
        self.assertEqual(new_recipe_job_1.job.status, 'COMPLETED')
        self.assertFalse(new_recipe_job_1.is_original)
        self.assertEqual(new_recipe_job_2.job.status, 'QUEUED')
        self.assertTrue(new_recipe_job_2.is_original)

        # Complete both the old and new job 2 and check that only the new recipe completes
        job_exe_2 = JobExecution.objects.get(job_id=recipe_job_2.job_id)
        queued_job_exe_2 = QueuedJobExecution(Queue.objects.get(job_exe_id=job_exe_2.id))
        queued_job_exe_2.accepted(node, JobResources(cpus=10, mem=1000, disk_in=1000, disk_out=1000, disk_total=2000))
        Queue.objects.schedule_job_executions('123', [queued_job_exe_2], {})
        Queue.objects.handle_job_completion(job_exe_2.id, now())
        new_job_exe_2 = JobExecution.objects.get(job_id=new_recipe_job_2.job_id)
        new_queued_job_exe_2 = QueuedJobExecution(Queue.objects.get(job_exe_id=new_job_exe_2.id))
        new_queued_job_exe_2.accepted(node, JobResources(cpus=10, mem=1000, disk_in=1000, disk_out=1000,
                                                         disk_total=2000))
        Queue.objects.schedule_job_executions('123', [new_queued_job_exe_2], {})
        Queue.objects.handle_job_completion(new_job_exe_2.id, now())
        # Completing the superseded recipe's job must NOT complete the old recipe;
        # only the new (superseding) recipe should gain a completed timestamp
        recipe = Recipe.objects.get(id=recipe.id)
        new_recipe = Recipe.objects.get(id=new_recipe.id)
        self.assertIsNone(recipe.completed)
        self.assertIsNotNone(new_recipe.completed)
Example #23
0
    def _process_queue(self, nodes, job_types, job_type_limits,
                       job_type_resources, workspaces):
        """Retrieves the top of the queue and schedules new job executions on available nodes as resources and limits
        allow

        :param nodes: The dict of scheduling nodes stored by node ID for all nodes ready to accept new job executions
        :type nodes: dict
        :param job_types: The dict of job type models stored by job type ID
        :type job_types: dict
        :param job_type_limits: The dict of job type IDs mapping to job type limits
        :type job_type_limits: dict
        :param job_type_resources: The list of all of the job type resource requirements
        :type job_type_resources: list
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: dict
        :returns: The list of queued job executions that were scheduled
        :rtype: list
        """

        scheduled_job_executions = []
        started = now()
        # Aggregated skip reasons per job type name, logged once at the end
        type_warnings = {}

        # We can schedule as long as there are nodes
        if not nodes:
            logger.warning(
                'There are no nodes available. Waiting to schedule until there are free resources...'
            )
            return scheduled_job_executions

        ignore_job_type_ids = self._calculate_job_types_to_ignore(
            job_types, job_type_limits)
        max_cluster_resources = resource_mgr.get_max_available_resources()
        for queue in Queue.objects.get_queue(
                scheduler_mgr.config.queue_mode,
                ignore_job_type_ids)[:QUEUE_LIMIT]:
            job_exe = QueuedJobExecution(queue)

            # Canceled job executions get processed as scheduled executions
            if job_exe.is_canceled:
                scheduled_job_executions.append(job_exe)
                continue

            # Use job_type_id directly to avoid fetching the related job_type
            # model just to read its primary key
            jt = job_type_mgr.get_job_type(queue.job_type_id)
            warning_name = INVALID_RESOURCES.name + jt.name
            title = INVALID_RESOURCES.title % jt.name
            warning = SchedulerWarning(name=warning_name,
                                       title=title,
                                       description=None)
            if jt.unmet_resources and scheduler_mgr.is_warning_active(warning):
                # previously checked this job type and found we lacked resources; wait until warning is inactive to check again
                continue

            invalid_resources = []
            insufficient_resources = []
            # get resource names offered and compare to job type resources
            for resource in job_exe.required_resources.resources:
                # Check for invalid resource or sharedmem
                if (resource.name not in max_cluster_resources._resources) or (
                        resource.name.lower() == 'sharedmem'):
                    # Skip sharedmem if its 0
                    if (resource.name.lower()
                            == 'sharedmem') and (resource.value <= 0):
                        continue
                    if jt.name in type_warnings:
                        type_warnings[jt.name]['count'] += 1
                        if resource.name not in type_warnings[
                                jt.name]['warning']:
                            type_warnings[jt.name]['warning'] += (
                                ', %s' % resource.name)
                    else:
                        type_warnings[jt.name] = {
                            'warning':
                            '%s job types could not be scheduled as the following resources do not exist in the available cluster resources: %s'
                            % (jt.name, resource.name),
                            'count':
                            1
                        }
                    # resource does not exist in cluster
                    invalid_resources.append(resource.name)
                elif resource.value > max_cluster_resources._resources[
                        resource.name].value:
                    # resource exceeds the max available from any node
                    insufficient_resources.append(resource.name)

            if invalid_resources:
                description = INVALID_RESOURCES.description % invalid_resources
                scheduler_mgr.warning_active(warning, description)

            if insufficient_resources:
                description = INSUFFICIENT_RESOURCES.description % insufficient_resources
                scheduler_mgr.warning_active(warning, description)

            if invalid_resources or insufficient_resources:
                # Record every problem resource on the job type so future
                # passes can short-circuit while the warning stays active
                invalid_resources.extend(insufficient_resources)
                jt.unmet_resources = ','.join(invalid_resources)
                jt.save(update_fields=["unmet_resources"])
                continue
            else:
                scheduler_mgr.warning_inactive(warning)
                if jt.unmet_resources:
                    # Only hit the database when the flag actually needs
                    # clearing; the previous unconditional save() issued one
                    # write per queued job on every scheduling pass
                    jt.unmet_resources = None
                    jt.save(update_fields=["unmet_resources"])

            # Make sure execution's job type and workspaces have been synced to the scheduler
            job_type_id = queue.job_type_id
            if job_type_id not in job_types:
                scheduler_mgr.warning_active(
                    UNKNOWN_JOB_TYPE,
                    description=UNKNOWN_JOB_TYPE.description % job_type_id)
                continue

            workspace_names = job_exe.configuration.get_input_workspace_names()
            workspace_names.extend(
                job_exe.configuration.get_output_workspace_names())

            missing_workspace = False
            for name in workspace_names:
                missing_workspace = missing_workspace or name not in workspaces
            if missing_workspace:
                if jt.name in type_warnings:
                    type_warnings[jt.name]['count'] += 1
                else:
                    type_warnings[jt.name] = {
                        'warning':
                        '%s job types could not be scheduled due to missing workspace'
                        % jt.name,
                        'count':
                        1
                    }
                continue

            # Check limit for this execution's job type
            if job_type_id in job_type_limits and job_type_limits[
                    job_type_id] < 1:
                if jt.name in type_warnings:
                    type_warnings[jt.name]['count'] += 1
                else:
                    type_warnings[jt.name] = {
                        'warning':
                        '%s job types could not be scheduled due to scheduling limit reached'
                        % jt.name,
                        'count':
                        1
                    }
                continue

            # Try to schedule job execution and adjust job type limit if needed
            if self._schedule_new_job_exe(job_exe, nodes, job_type_resources):
                scheduled_job_executions.append(job_exe)
                if job_type_id in job_type_limits:
                    job_type_limits[job_type_id] -= 1

        duration = now() - started
        if type_warnings:
            for warn in type_warnings:
                logger.warning('%d %s', type_warnings[warn]['count'],
                               type_warnings[warn]['warning'])

        msg = 'Processing queue took %.3f seconds'
        if duration > PROCESS_QUEUE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())

        return scheduled_job_executions
Example #24
0
    def test_score_job_exe_for_reservation(self):
        """Tests calling score_job_exe_for_reservation() successfully"""

        # Mock a healthy node ready for new work
        mock_node = MagicMock()
        mock_node.hostname = 'host_1'
        mock_node.id = 1
        mock_node.is_ready_for_new_job = MagicMock(return_value=True)
        mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)

        resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]),
                                   NodeResources(),
                                   NodeResources([Cpus(200.0), Mem(700.0)]))
        # HealthTask resources are 0.1 CPUs and 32 MiB memory
        health_task = HealthTask('1234', 'agent_1')
        running_exe_p1000 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]),
            priority=1000)
        running_exe_p100 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(56.0), Mem(15.0)]),
            priority=100)
        sched_node = SchedulingNode('agent_1', mock_node, [health_task],
                                    [running_exe_p1000, running_exe_p100],
                                    resource_set)

        # Two queued executions already accepted onto the node
        queued_exe_p100 = QueuedJobExecution(
            queue_test_utils.create_queue(priority=100,
                                          cpus_required=8.0,
                                          mem_required=40.0,
                                          disk_in_required=0.0,
                                          disk_out_required=0.0,
                                          disk_total_required=0.0))
        queued_exe_p1000 = QueuedJobExecution(
            queue_test_utils.create_queue(priority=1000,
                                          cpus_required=8.0,
                                          mem_required=40.0,
                                          disk_in_required=0.0,
                                          disk_out_required=0.0,
                                          disk_total_required=0.0))
        sched_node.accept_new_job_exe(queued_exe_p100)
        sched_node.accept_new_job_exe(queued_exe_p1000)

        # We are going to try to reserve the node for a job execution with priority 120
        # Calculate available resources for reservation:
        # Watermark (200, 700) - System Tasks (0.1, 32) - Higher Priority Existing Job Exes (56, 15) -  Higher Priority
        # New Job Exes (8, 40) = 135.9 CPUs, 613 memory
        # This new job should fit for reservation
        reservation_exe = QueuedJobExecution(
            queue_test_utils.create_queue(priority=120,
                                          cpus_required=130.0,
                                          mem_required=600.0,
                                          disk_in_required=0.0,
                                          disk_out_required=0.0,
                                          disk_total_required=0.0))

        # Expected available 5.9 CPUs and 13 MiB memory "left" on node
        # (available above - new job we are scoring)
        # First 2 job types should fit, next 2 are too big, so score should be 2
        type_resources = [
            NodeResources([Cpus(2.0), Mem(10.0)]),
            NodeResources([Cpus(5.5), Mem(12.0)]),
            NodeResources([Cpus(6.0), Mem(10.0)]),
            NodeResources([Cpus(2.0), Mem(14.0)]),
        ]

        score = sched_node.score_job_exe_for_reservation(reservation_exe,
                                                         type_resources)
        self.assertEqual(score, 2)