Beispiel #1
0
    def test_change_agent_id(self):
        """Tests the CleanupManager where a node's agent ID changes"""

        manager = CleanupManager()
        node_1 = Node(self.node_agent_1, self.node_1)
        node_2 = Node(self.node_agent_2, self.node_2)
        manager.update_nodes([node_1, node_2])

        # Launch every task the manager offers and remember the one issued for node 1's original agent ID
        task_1 = None
        for task in manager.get_next_tasks():
            task.launch(now())
            if task.agent_id == self.node_agent_1:
                task_1 = task
        # Fail with a clear message here instead of an AttributeError on None further down
        self.assertIsNotNone(task_1)

        # Node 1 changes agent ID
        node_1.update_from_mesos(agent_id=self.node_agent_3)
        manager.update_nodes([node_1, node_2])

        # Should get new initial cleanup task for node 1
        tasks = manager.get_next_tasks()
        self.assertEqual(len(tasks), 1)
        new_task_1 = tasks[0]
        self.assertEqual(new_task_1.agent_id, self.node_agent_3)

        # Task update comes back for original node 1 initial cleanup task, manager should ignore with no exception
        update = job_test_utils.create_task_status_update(task_1.id, task_1.agent_id, TaskStatusUpdate.FAILED, now())
        manager.handle_task_update(update)
Beispiel #2
0
    def test_handle_initial_cleanup_task(self):
        """Tests that the initial cleanup task is offered first and is marked completed once it finishes"""

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): spelled "_last_heath_task" here while other tests set "_last_health_task" - confirm which
        # attribute name Node actually reads
        node._last_heath_task = when

        # The first task offered must be the initial cleanup task for this agent
        cleanup_task = node.get_next_tasks(when)[0]
        self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(cleanup_task.is_initial_cleanup)
        self.assertEqual(cleanup_task.agent_id, self.node_agent)

        # After launching it, no other task is offered and the cleanup is still pending
        self.task_mgr.launch_tasks([cleanup_task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Report the task as RUNNING and then FINISHED to both the task manager and the node
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Whatever is offered next must not be another cleanup task
        for next_task in node.get_next_tasks(when):
            self.assertFalse(next_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(node._is_initial_cleanup_completed)
Beispiel #3
0
    def setUp(self):
        """Creates the scheduler, one regular and one paused node, two running job executions and five queues"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        def with_resources(job_exe, cpus, mem, disk_in, disk_out, disk_total):
            """Sets the scheduled resource fields on the given job execution and returns it"""
            job_exe.cpus_scheduled = cpus
            job_exe.mem_scheduled = mem
            job_exe.disk_in_scheduled = disk_in
            job_exe.disk_out_scheduled = disk_out
            job_exe.disk_total_scheduled = disk_total
            return job_exe

        # Regular node: image flagged as pulled and initial cleanup marked complete before the state update
        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node._is_image_pulled = True
        self.node._initial_cleanup_completed()
        self.node._update_state()

        # Paused node: the model is flagged as paused before the Node wrapper is built
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # One running job execution on each node, both with the same scheduled resources
        self.running_job_exe_1 = with_resources(
            job_test_utils.create_job_exe(status='RUNNING', node=self.paused_node_model),
            2.0, 512.0, 100.0, 200.0, 300.0)
        self.running_job_exe_2 = with_resources(
            job_test_utils.create_job_exe(status='RUNNING', node=self.node_model),
            2.0, 512.0, 100.0, 200.0, 300.0)

        # Queues with varying resource requirements
        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(cpus_required=200.0, mem_required=1024.0,
                                                             disk_in_required=100.0, disk_out_required=200.0,
                                                             disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(cpus_required=2.0, mem_required=10240.0,
                                                            disk_in_required=100.0, disk_out_required=200.0,
                                                            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0,
                                                             disk_in_required=10000.0, disk_out_required=20000.0,
                                                             disk_total_required=30000.0)
Beispiel #4
0
    def test_handle_regular_cleanup_task(self):
        """Tests that NodeCleanup issues a regular cleanup task for an added job execution and none once it finishes"""

        node = Node(self.node_agent, self.node)
        node.initial_cleanup_completed()
        cleanup = NodeCleanup(node)

        # Nothing to clean yet, so no task is produced
        self.assertIsNone(cleanup.get_next_task())

        # Adding a job execution yields a regular (non-initial) cleanup task covering exactly that execution
        running_exe = RunningJobExecution(self.job_exe)
        cleanup.add_job_execution(running_exe)
        cleanup_task = cleanup.get_next_task()
        self.assertIsNotNone(cleanup_task)
        self.assertFalse(cleanup_task.is_initial_cleanup)
        self.assertListEqual(cleanup_task.job_exes, [running_exe])

        # Launch the task and report it as RUNNING, then FINISHED
        cleanup_task.launch(now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now())
            cleanup.handle_task_update(update)

        # Everything has been cleaned, so no further task is produced
        self.assertIsNone(cleanup.get_next_task())
Beispiel #5
0
    def test_node_that_is_not_cleaned_yet_no_pull_task(self):
        """Tests that a freshly created node (initial cleanup not yet done) offers no pull task"""

        when = now()
        uncleaned_node = Node(self.node_agent, self.node)

        # Collect the IDs of any offered pull tasks; there must be none
        pull_task_ids = [task.id for task in uncleaned_node.get_next_tasks(when)
                         if task.id.startswith(PULL_TASK_ID_PREFIX)]
        self.assertListEqual([], pull_task_ids)
Beispiel #6
0
    def test_paused_node_cleanup_task(self):
        """Tests that a paused node offers no tasks, in particular no cleanup task"""

        when = now()
        paused_model = node_test_utils.create_node(hostname='host_1_paused', slave_id='agent_paused')
        paused_model.is_paused = True
        paused = Node('agent_paused', paused_model, self.scheduler)

        # Record a health task as just performed so that none is scheduled for this call
        paused._last_health_task = when

        offered = paused.get_next_tasks(when)
        self.assertListEqual([], offered)
Beispiel #7
0
    def setUp(self):
        """Creates the scheduler, one regular and one paused node, five running job executions and five queues"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        def running_exe(cpus, mem, disk_in, disk_out, disk_total):
            """Creates a RUNNING job execution and sets its scheduled resource fields"""
            job_exe = job_test_utils.create_job_exe(status='RUNNING')
            job_exe.cpus_scheduled = cpus
            job_exe.mem_scheduled = mem
            job_exe.disk_in_scheduled = disk_in
            job_exe.disk_out_scheduled = disk_out
            job_exe.disk_total_scheduled = disk_total
            return job_exe

        # Regular node with its initial cleanup marked complete
        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node.initial_cleanup_completed()

        # Paused node: the model is flagged as paused before the Node wrapper is built
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Running job executions, including ones with very large CPU, memory and disk amounts
        self.running_job_exe_1 = running_exe(2.0, 512.0, 100.0, 200.0, 300.0)
        self.running_job_exe_2 = running_exe(4.0, 1024.0, 500.0, 50.0, 550.0)
        self.running_job_exe_high_cpus = running_exe(200.0, 512.0, 100.0, 200.0, 300.0)
        self.running_job_exe_high_mem = running_exe(2.0, 1048576.0, 100.0, 200.0, 300.0)
        self.running_job_exe_high_disk = running_exe(2.0, 512.0, 10000.0, 20000.0, 30000.0)

        # Queues with varying resource requirements
        self.queue_1 = queue_test_utils.create_queue(
            cpus_required=2.0, mem_required=1024.0, disk_in_required=100.0, disk_out_required=200.0,
            disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(
            cpus_required=8.0, mem_required=512.0, disk_in_required=400.0, disk_out_required=45.0,
            disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(
            cpus_required=200.0, mem_required=1024.0, disk_in_required=100.0, disk_out_required=200.0,
            disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(
            cpus_required=2.0, mem_required=10240.0, disk_in_required=100.0, disk_out_required=200.0,
            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(
            cpus_required=2.0, mem_required=1024.0, disk_in_required=10000.0, disk_out_required=20000.0,
            disk_total_required=30000.0)
Beispiel #8
0
    def test_paused_node_pull_task(self):
        """Tests that a paused node offers no tasks, in particular no pull task, even after initial cleanup"""

        when = now()
        paused_model = node_test_utils.create_node(hostname='host_1_paused', slave_id='agent_paused')
        paused_model.is_paused = True
        paused = Node('agent_paused', paused_model, self.scheduler)

        # Record a health task as just performed and mark the initial cleanup done before refreshing the state
        paused._last_health_task = when
        paused._initial_cleanup_completed()
        paused._update_state()

        # The node is paused, so nothing may be offered
        offered = paused.get_next_tasks(when)
        self.assertListEqual([], offered)
Beispiel #9
0
    def test_generate_status_json(self, mock_now):
        """Tests that generate_status_json() reports a degraded node with one pull error and one cleanup warning"""

        right_now = now()
        mock_now.return_value = right_now
        num_job_exes = JOB_EXES_WARNING_THRESHOLD + 1

        # Put the node into a state with a failed pull task and a cleanup count just above the warning threshold
        node = Node(self.node_agent, self.node, self.scheduler)
        node._conditions.handle_pull_task_failed()
        node._conditions.update_cleanup_count(num_job_exes)
        node._update_state()
        nodes_list = []
        node.generate_status_json(nodes_list)

        # Build the expected entry piece by piece
        timestamp = datetime_to_string(right_now)
        expected_state = {'name': 'DEGRADED', 'title': Node.DEGRADED.title, 'description': Node.DEGRADED.description}
        expected_error = {'name': 'IMAGE_PULL', 'title': NodeConditions.IMAGE_PULL_ERR.title,
                          'description': NodeConditions.IMAGE_PULL_ERR.description,
                          'started': timestamp, 'last_updated': timestamp}
        expected_warning = {'name': 'CLEANUP', 'title': NodeConditions.CLEANUP_WARNING.title,
                            'description': NodeConditions.CLEANUP_WARNING.description % num_job_exes,
                            'started': timestamp, 'last_updated': timestamp}
        expected_node = {'id': node.id, 'hostname': node.hostname, 'agent_id': self.node_agent, 'is_active': True,
                         'state': expected_state, 'errors': [expected_error], 'warnings': [expected_warning]}
        expected_results = [expected_node]
        self.assertListEqual(nodes_list, expected_results)
Beispiel #10
0
    def test_handle_killed_cleanup_task(self):
        """Tests that a killed initial cleanup task is replaced by a new cleanup task with a different ID"""

        when = now()
        node = Node(self.node_agent, self.node, self.scheduler)

        # The first task offered is the initial cleanup task
        first_task = node.get_next_tasks(when)[0]
        self.assertTrue(first_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = first_task.id

        # Launch it, then report RUNNING followed by KILLED
        self.task_mgr.launch_tasks([first_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.KILLED):
            update = job_test_utils.create_task_status_update(first_task.id, first_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # A fresh cleanup task is offered and the initial cleanup is still not complete
        replacement = node.get_next_tasks(when)[0]
        self.assertTrue(replacement.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertNotEqual(replacement.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)
Beispiel #11
0
    def test_handle_killed_health_task(self):
        """Tests that a killed health task is replaced by a new health task and leaves the health check normal"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is the health task
        first_task = node.get_next_tasks(when)[0]
        self.assertTrue(first_task.id.startswith(HEALTH_TASK_ID_PREFIX))
        task_1_id = first_task.id

        # Launch it, then report RUNNING followed by KILLED
        self.task_mgr.launch_tasks([first_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.KILLED):
            update = job_test_utils.create_task_status_update(first_task.id, first_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # A fresh health task is offered and the health check is still considered normal
        replacement = node.get_next_tasks(when)[0]
        self.assertTrue(replacement.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(replacement.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)
Beispiel #12
0
    def test_handle_successful_health_task(self):
        """Tests that a health task which finishes successfully keeps the health check normal"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is the health task for this agent
        health_task = node.get_next_tasks(when)[0]
        self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(health_task.agent_id, self.node_agent)

        # After launching it, nothing further is offered
        self.task_mgr.launch_tasks([health_task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)

        # Report the health task as RUNNING and then FINISHED
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Still nothing new to run and the health check remains normal
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)
Beispiel #13
0
    def test_handle_failed_health_task_low_docker_space(self):
        """Tests that a health task failing with the low Docker space exit code degrades the node"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is the health task
        health_task = node.get_next_tasks(when)[0]
        self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Launch it, report RUNNING, then FAILED carrying the low Docker space exit code
        self.task_mgr.launch_tasks([health_task], now())
        status_updates = [
            (TaskStatusUpdate.RUNNING, {}),
            (TaskStatusUpdate.FAILED, {'exit_code': HealthTask.LOW_DOCKER_SPACE_CODE}),
        ]
        for status, extra in status_updates:
            update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id, status, now(),
                                                              **extra)
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # The node must be degraded with the low Docker space error active
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertIn(NodeConditions.LOW_DOCKER_SPACE_ERR.name, node._conditions._active_errors)
Beispiel #14
0
    def test_handle_failed_health_task_bad_logstash(self):
        """Tests that a health task failing with the bad logstash exit code degrades the node"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is the health task
        health_task = node.get_next_tasks(when)[0]
        self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Launch it, report RUNNING, then FAILED carrying the bad logstash exit code
        self.task_mgr.launch_tasks([health_task], now())
        status_updates = [
            (TaskStatusUpdate.RUNNING, {}),
            (TaskStatusUpdate.FAILED, {'exit_code': HealthTask.BAD_LOGSTASH_CODE}),
        ]
        for status, extra in status_updates:
            update = job_test_utils.create_task_status_update(health_task.id, health_task.agent_id, status, now(),
                                                              **extra)
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # The node must be degraded with the bad logstash error active
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertIn(NodeConditions.BAD_LOGSTASH_ERR.name, node._conditions._active_errors)
Beispiel #15
0
    def test_handle_failed_health_task(self):
        """Tests that a failed health task degrades the node and is only retried after the error threshold"""

        when = now()
        five_seconds = datetime.timedelta(seconds=5)
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is the health task
        first_task = node.get_next_tasks(when)[0]
        task_1_id = first_task.id
        self.assertTrue(first_task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Launch it, then report RUNNING followed by FAILED (no specific exit code)
        self.task_mgr.launch_tasks([first_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FAILED):
            update = job_test_utils.create_task_status_update(first_task.id, first_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # The node must be degraded with the generic health failure error active
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertIn(NodeConditions.HEALTH_FAIL_ERR.name, node._conditions._active_errors)

        # Shortly afterwards no new health task is offered yet
        self.assertListEqual([], node.get_next_tasks(when + five_seconds))
        self.assertFalse(node._conditions.is_health_check_normal)

        # Once the error threshold has passed, a new health task with a different ID is offered
        new_time = when + Node.HEALTH_ERR_THRESHOLD + five_seconds
        retry_task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(retry_task.id, task_1_id)
        self.assertTrue(retry_task.id.startswith(HEALTH_TASK_ID_PREFIX))
Beispiel #16
0
    def test_handle_regular_cleanup_task(self):
        """Tests that a node issues a regular cleanup task for an added job execution and none once it finishes"""

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): spelled "_last_heath_task" here while other tests set "_last_health_task" - confirm which
        # attribute name Node actually reads
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # Nothing to clean yet, so no task is offered
        self.assertListEqual([], node.get_next_tasks(when))

        # Adding a job execution yields a regular (non-initial) cleanup task covering exactly that execution
        running_exe = RunningJobExecution(self.job_exe)
        node.add_job_execution(running_exe)
        cleanup_task = node.get_next_tasks(when)[0]
        self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertFalse(cleanup_task.is_initial_cleanup)
        self.assertListEqual(cleanup_task.job_exes, [running_exe])

        # Launch the task and report it as RUNNING, then FINISHED
        self.task_mgr.launch_tasks([cleanup_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Everything has been cleaned, so no further task is offered
        self.assertListEqual([], node.get_next_tasks(when))
Beispiel #17
0
    def test_handle_successful_pull_task(self):
        """Tests that a Docker pull task which finishes successfully marks the image pulled and the node ready"""

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): spelled "_last_heath_task" here while other tests set "_last_health_task" - confirm which
        # attribute name Node actually reads
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()

        # The first task offered is the Docker pull task for this agent
        pull_task = node.get_next_tasks(when)[0]
        self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(pull_task.agent_id, self.node_agent)

        # After launching it, nothing further is offered and the image is not yet pulled
        self.task_mgr.launch_tasks([pull_task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_image_pulled)

        # Report the pull task as RUNNING and then FINISHED
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Nothing new to run, the image is pulled and the node has reached the READY state
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._is_image_pulled)
        self.assertEqual(node._state, Node.READY)
Beispiel #18
0
    def test_handle_failed_pull_task(self):
        """Tests that a failed Docker pull task is only retried after the image pull error threshold"""

        when = now()
        five_seconds = datetime.timedelta(seconds=5)
        node = Node(self.node_agent, self.node, self.scheduler)
        node._last_health_task = when
        node._initial_cleanup_completed()
        node._update_state()

        # The first task offered is the Docker pull task
        first_task = node.get_next_tasks(when)[0]
        task_1_id = first_task.id
        self.assertTrue(first_task.id.startswith(PULL_TASK_ID_PREFIX))

        # Launch it, then report RUNNING followed by FAILED
        self.task_mgr.launch_tasks([first_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FAILED):
            update = job_test_utils.create_task_status_update(first_task.id, first_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Shortly afterwards no new pull task is offered and the image is still not pulled
        self.assertListEqual([], node.get_next_tasks(when + five_seconds))
        self.assertFalse(node._is_image_pulled)

        # Once the error threshold has passed, a new pull task with a different ID is offered
        new_time = when + Node.IMAGE_PULL_ERR_THRESHOLD + five_seconds
        # Move the last health task time forward so a health task is not the first task returned
        node._last_health_task = new_time
        retry_task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(retry_task.id, task_1_id)
        self.assertTrue(retry_task.id.startswith(PULL_TASK_ID_PREFIX))
Beispiel #19
0
class TestNodeOffers(TestCase):
    """Tests NodeOffers: offer bookkeeping and considering queued and running job executions"""

    def setUp(self):
        """Creates the nodes, running job executions and queue models shared by the tests"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'

        # Node with its image pulled and initial cleanup completed
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node._is_image_pulled = True
        self.node._initial_cleanup_completed()
        self.node._update_state()

        # Node whose model is flagged as paused
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Running job executions: arguments are cpus, mem, disk_in, disk_out, disk_total
        self.running_job_exe_1 = self._scheduled_job_exe(2.0, 512.0, 100.0, 200.0, 300.0)
        self.running_job_exe_2 = self._scheduled_job_exe(4.0, 1024.0, 500.0, 50.0, 550.0)
        self.running_job_exe_high_cpus = self._scheduled_job_exe(200.0, 512.0, 100.0, 200.0, 300.0)
        self.running_job_exe_high_mem = self._scheduled_job_exe(2.0, 1048576.0, 100.0, 200.0, 300.0)
        self.running_job_exe_high_disk = self._scheduled_job_exe(2.0, 512.0, 10000.0, 20000.0, 30000.0)

        # Queue models: arguments are cpus, mem, disk_in, disk_out, disk_total
        self.queue_1 = self._queue_requiring(2.0, 1024.0, 100.0, 200.0, 300.0)
        self.queue_2 = self._queue_requiring(8.0, 512.0, 400.0, 45.0, 445.0)
        self.queue_high_cpus = self._queue_requiring(200.0, 1024.0, 100.0, 200.0, 300.0)
        self.queue_high_mem = self._queue_requiring(2.0, 10240.0, 100.0, 200.0, 300.0)
        self.queue_high_disk = self._queue_requiring(2.0, 1024.0, 10000.0, 20000.0, 30000.0)

    @staticmethod
    def _scheduled_job_exe(cpus, mem, disk_in, disk_out, disk_total):
        """Creates a RUNNING job execution and sets its scheduled resource fields"""

        job_exe = job_test_utils.create_job_exe(status='RUNNING')
        job_exe.cpus_scheduled = cpus
        job_exe.mem_scheduled = mem
        job_exe.disk_in_scheduled = disk_in
        job_exe.disk_out_scheduled = disk_out
        job_exe.disk_total_scheduled = disk_total
        return job_exe

    @staticmethod
    def _queue_requiring(cpus, mem, disk_in, disk_out, disk_total):
        """Creates a queue model with the given required resources"""

        return queue_test_utils.create_queue(cpus_required=cpus, mem_required=mem, disk_in_required=disk_in,
                                             disk_out_required=disk_out, disk_total_required=disk_total)

    @staticmethod
    def _two_offer_node_offers(node, agent_id):
        """Builds NodeOffers for the node and adds the two standard offers (74 CPUs, 3072 mem, 3072 disk in total)

        Returns the NodeOffers object together with both offers.
        """

        node_offers = NodeOffers(node)
        first = ResourceOffer('offer_1', agent_id, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(first)
        second = ResourceOffer('offer_2', agent_id, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(second)
        return node_offers, first, second

    def _assert_nothing_accepted(self, node_offers):
        """Asserts that the node offers hold no accepted running or new job executions"""

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    def _assert_available(self, node_offers, cpus, mem, disk):
        """Asserts the remaining CPU, memory and disk tracked by the node offers"""

        self.assertEqual(node_offers._available_cpus, cpus)
        self.assertEqual(node_offers._available_mem, mem)
        self.assertEqual(node_offers._available_disk, disk)

    def test_adding_offers(self):
        """Tests adding offer and checking the results"""

        node_offers = NodeOffers(self.node)

        first = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        for _ in range(2):  # Adding the same offer a second time should be ignored
            node_offers.add_offer(first)

        second = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=5.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(second)

        third = ResourceOffer('offer_3', self.node_agent, NodeResources(cpus=3.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(third)

        # Offer for a different agent, must not be counted in the totals
        foreign = ResourceOffer('offer_4', 'bad_agent', NodeResources(cpus=1.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(foreign)

        self._assert_available(node_offers, 10.0, 3584.0, 4096.0)

    def test_consider_new_job_exe(self):
        """Tests consider_new_job_exe() and get_accepted_new_job_exes()"""

        node_offers, _, _ = self._two_offer_node_offers(self.node, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_1), NodeOffers.ACCEPTED)
        # Considering the same job_exe again should have no effect
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_1), NodeOffers.ACCEPTED)

        # Each of these asks for more of one resource than the offers provide
        job_exe_high_cpus = QueuedJobExecution(self.queue_high_cpus)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_high_cpus), NodeOffers.NOT_ENOUGH_CPUS)

        job_exe_high_mem = QueuedJobExecution(self.queue_high_mem)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_high_mem), NodeOffers.NOT_ENOUGH_MEM)

        job_exe_high_disk = QueuedJobExecution(self.queue_high_disk)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_high_disk), NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = QueuedJobExecution(self.queue_2)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_2), NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_new_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_new_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])

        self._assert_available(node_offers, 64.0, 1536.0, 2327.0)

    def test_consider_next_task(self):
        """Tests consider_next_task() and get_accepted_running_job_exes()"""

        node_offers, _, _ = self._two_offer_node_offers(self.node, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.ACCEPTED)
        # Considering the same job_exe again should have no effect
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.ACCEPTED)

        # Each of these asks for more of one resource than the offers provide
        job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
        self.assertEqual(node_offers.consider_next_task(job_exe_high_cpus), NodeOffers.NOT_ENOUGH_CPUS)

        job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
        self.assertEqual(node_offers.consider_next_task(job_exe_high_mem), NodeOffers.NOT_ENOUGH_MEM)

        job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
        self.assertEqual(node_offers.consider_next_task(job_exe_high_disk), NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(node_offers.consider_next_task(job_exe_2), NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self._assert_available(node_offers, 68.0, 1536.0, 2222.0)

    def test_paused_node(self):
        """Tests adding job executions when the node is paused"""

        node_offers, _, _ = self._two_offer_node_offers(self.paused_node, self.node_agent_paused)
        self._assert_nothing_accepted(node_offers)

        # Tasks for job executions that are already running are still accepted
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(node_offers.consider_next_task(job_exe_2), NodeOffers.ACCEPTED)

        # A new job execution is turned away while the node is paused
        job_exe_new = QueuedJobExecution(self.queue_1)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_new), NodeOffers.NODE_NOT_READY)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self._assert_available(node_offers, 68.0, 1536.0, 2222.0)

    def test_lost_node(self):
        """Tests when the node is lost"""

        node_offers, _, _ = self._two_offer_node_offers(self.node, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        # Accept one running and one queued job execution
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.ACCEPTED)

        job_exe_2 = QueuedJobExecution(self.queue_1)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_2), NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertGreater(node_offers._available_cpus, 0.0)
        self.assertGreater(node_offers._available_mem, 0.0)
        self.assertGreater(node_offers._available_disk, 0.0)

        # Losing the node should drop the accepted job executions and zero the resources
        node_offers.lost_node()
        self.assertFalse(node_offers.has_accepted_job_exes())
        self._assert_available(node_offers, 0.0, 0.0, 0.0)

    def test_no_offers(self):
        """Tests adding job executions when there are no offers"""

        node_offers = NodeOffers(self.node)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.NO_OFFERS)

        job_exe_new = QueuedJobExecution(self.queue_1)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_new), NodeOffers.NO_OFFERS)

        self._assert_nothing_accepted(node_offers)

    def test_job_exe_canceled(self):
        """Tests adding a job execution that becomes canceled while scheduling"""

        node_offers, _, _ = self._two_offer_node_offers(self.node, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        job_exe_1.execution_canceled()
        self.assertEqual(node_offers.consider_next_task(job_exe_1), NodeOffers.TASK_INVALID)

        self._assert_nothing_accepted(node_offers)

    def test_remove_offer(self):
        """Tests remove_offer()"""

        node_offers, offer_1, offer_2 = self._two_offer_node_offers(self.node, self.node_agent)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(node_offers.consider_new_job_exe(job_exe_1), NodeOffers.ACCEPTED)

        # With only the first offer gone, the new job execution should remain accepted
        node_offers.remove_offer(offer_1.id)
        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [job_exe_1])

        # With the second offer gone too, nothing is left and all job executions should be dropped
        node_offers.remove_offer(offer_2.id)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self._assert_available(node_offers, 0.0, 0.0, 0.0)
Beispiel #20
0
class TestOfferManager(TestCase):
    """Tests OfferManager: readying offers and considering queued and running job executions"""

    def setUp(self):
        """Creates the nodes, running job executions and queue models shared by the tests"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'

        # Node with its image pulled and initial cleanup completed
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node._is_image_pulled = True
        self.node._initial_cleanup_completed()
        self.node._update_state()

        # Node whose model is flagged as paused
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # One running job execution per node, both with the same scheduled resources
        self.running_job_exe_1 = self._scheduled_job_exe(self.paused_node_model)
        self.running_job_exe_2 = self._scheduled_job_exe(self.node_model)

        # Queue models: arguments are cpus, mem, disk_in, disk_out, disk_total
        self.queue_1 = self._queue_requiring(4.0, 1024.0, 100.0, 200.0, 300.0)
        self.queue_2 = self._queue_requiring(8.0, 512.0, 400.0, 45.0, 445.0)
        self.queue_high_cpus = self._queue_requiring(200.0, 1024.0, 100.0, 200.0, 300.0)
        self.queue_high_mem = self._queue_requiring(2.0, 10240.0, 100.0, 200.0, 300.0)
        self.queue_high_disk = self._queue_requiring(2.0, 1024.0, 10000.0, 20000.0, 30000.0)

    @staticmethod
    def _scheduled_job_exe(node_model):
        """Creates a RUNNING job execution on the given node model and sets its scheduled resource fields"""

        job_exe = job_test_utils.create_job_exe(status='RUNNING', node=node_model)
        job_exe.cpus_scheduled = 2.0
        job_exe.mem_scheduled = 512.0
        job_exe.disk_in_scheduled = 100.0
        job_exe.disk_out_scheduled = 200.0
        job_exe.disk_total_scheduled = 300.0
        return job_exe

    @staticmethod
    def _queue_requiring(cpus, mem, disk_in, disk_out, disk_total):
        """Creates a queue model with the given required resources"""

        return queue_test_utils.create_queue(cpus_required=cpus, mem_required=mem, disk_in_required=disk_in,
                                             disk_out_required=disk_out, disk_total_required=disk_total)

    @staticmethod
    def _offer_pair(first_agent_id, second_agent_id):
        """Creates the standard small (2 CPU) and large (25 CPU) offers for the given agent IDs"""

        small = ResourceOffer('offer_1', first_agent_id, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        large = ResourceOffer('offer_2', second_agent_id, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        return small, large

    @staticmethod
    def _manager_with(offers, nodes=None, ready=True):
        """Creates an OfferManager, adds the offers, optionally updates its nodes and readies the new offers"""

        manager = OfferManager()
        manager.add_new_offers(offers)
        if nodes is not None:
            manager.update_nodes(nodes)
        if ready:
            manager.ready_new_offers()
        return manager

    def _assert_popped_count(self, manager, expected_count):
        """Pops the node offers with accepted job executions and asserts how many were returned"""

        popped = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped), expected_count)

    def _assert_queue_rejected(self, queue, expected_result):
        """Considers a new job execution for the queue on both nodes and asserts the rejection result"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(queue)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), expected_result)

        self._assert_popped_count(manager, 0)

    def test_no_ready_offers(self):
        """Tests considering job executions when no offers are ready"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], ready=False)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(job_exe_2), OfferManager.NODE_NOT_READY)

    def test_offers_with_no_nodes(self):
        """Tests considering job executions when offers cannot be readied due to no nodes updated"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent)
        manager = self._manager_with([offer_1, offer_2])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(job_exe_2), OfferManager.NODE_NOT_READY)

    def test_accepted(self):
        """Tests accepting a running and queued job execution and returning the node offers"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(job_exe_2), OfferManager.ACCEPTED)

        self._assert_popped_count(manager, 2)

    def test_remove_offers(self):
        """Tests accepting a running and queued job execution and then removing all offers"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(manager.consider_next_task(job_exe_2), OfferManager.ACCEPTED)

        manager.remove_offers([offer_2.id, offer_1.id])
        self._assert_popped_count(manager, 0)

    def test_lost_node(self):
        """Tests accepting a running and queued job execution and then the node being lost"""

        offer_1, offer_2 = self._offer_pair(self.node_agent, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(manager.consider_next_task(job_exe_2), OfferManager.ACCEPTED)

        manager.lost_node(self.node_agent)
        self._assert_popped_count(manager, 0)

    def test_lost_node_that_comes_back(self):
        """Tests that when a lost node comes back, it can schedule tasks again"""

        offer_1, offer_2 = self._offer_pair(self.node_agent, self.node_agent)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.node])

        # The node is lost and then reappears under a different agent ID
        manager.lost_node(self.node_agent)
        new_node_agent = 'i_am_a_new_node_agent'
        self.node.update_from_mesos(agent_id=new_node_agent)

        job_exe_1 = QueuedJobExecution(self.queue_1)

        # The old agent's offers should be gone, so the job exe must not be scheduled
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.NO_NODES_AVAILABLE)

        offer_3 = ResourceOffer('offer_3', new_node_agent, NodeResources(cpus=35.0, mem=3048.0, disk=3048.0))
        manager.add_new_offers([offer_3])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # With an offer for the new agent ID, the job exe should now be scheduled
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.ACCEPTED)
        self._assert_popped_count(manager, 1)

    def test_all_offers_paused(self):
        """Tests rejecting a queued job execution due to all nodes being paused"""

        offer_1, offer_2 = self._offer_pair(self.node_agent_paused, self.node_agent_paused)
        manager = self._manager_with([offer_1, offer_2], nodes=[self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        self.assertEqual(manager.consider_new_job_exe(job_exe_1), OfferManager.NO_NODES_AVAILABLE)

        self._assert_popped_count(manager, 0)

    def test_high_cpus(self):
        """Tests rejecting a queued job execution due to too many CPUs required"""

        self._assert_queue_rejected(self.queue_high_cpus, OfferManager.NOT_ENOUGH_CPUS)

    def test_high_mem(self):
        """Tests rejecting a queued job execution due to too much memory required"""

        self._assert_queue_rejected(self.queue_high_mem, OfferManager.NOT_ENOUGH_MEM)

    def test_high_disk(self):
        """Tests rejecting a queued job execution due to too much disk required"""

        self._assert_queue_rejected(self.queue_high_disk, OfferManager.NOT_ENOUGH_DISK)
Beispiel #21
0
    def test_handle_lost_pull_task(self):
        """Tests that a lost pull task is handed out again under the same task ID"""

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): the attribute is spelled "_last_heath_task" (no "l") - confirm this matches the
        # attribute Node actually reads, otherwise this assignment has no effect
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()

        def report(pull_task, status, via_task_mgr=True):
            # Sends a status update for the task to the node, and to the task manager unless told otherwise
            status_update = job_test_utils.create_task_status_update(
                pull_task.id, pull_task.agent_id, status, now())
            if via_task_mgr:
                self.task_mgr.handle_task_update(status_update)
            node.handle_task_update(status_update)

        def next_pull_task():
            # Fetches the node's first pending task and checks that it is a pull task
            candidate = node.get_next_tasks(when)[0]
            self.assertTrue(candidate.id.startswith(PULL_TASK_ID_PREFIX))
            return candidate

        def assert_same_task_reissued(expected_id):
            # The node must offer the very same pull task again and still consider the image not pulled
            reissued = next_pull_task()
            self.assertEqual(reissued.id, expected_id)
            self.assertFalse(node._is_image_pulled)
            return reissued

        # Get pull task
        task = next_pull_task()
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Task is lost before it was ever launched; only the node is told about it
        report(task, TaskStatusUpdate.LOST, via_task_mgr=False)
        task = assert_same_task_reissued(task_1_id)

        # Task is lost after being launched
        self.task_mgr.launch_tasks([task], now())
        report(task, TaskStatusUpdate.LOST)
        task = assert_same_task_reissued(task_1_id)

        # Task is lost after it had started running
        self.task_mgr.launch_tasks([task], now())
        report(task, TaskStatusUpdate.RUNNING)
        report(task, TaskStatusUpdate.LOST)
        assert_same_task_reissued(task_1_id)
Beispiel #22
0
class TestNodeOffers(TestCase):
    """Tests NodeOffers: adding and removing resource offers and considering job executions against them"""

    def setUp(self):
        """Creates a regular node, a paused node, and running/queued job executions with various resource needs"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node.initial_cleanup_completed()
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Running job executions: two of modest size, plus one oversized in each resource dimension
        self.running_job_exe_1 = self._create_running_job_exe(cpus=2.0, mem=512.0, disk_in=100.0, disk_out=200.0,
                                                              disk_total=300.0)
        self.running_job_exe_2 = self._create_running_job_exe(cpus=4.0, mem=1024.0, disk_in=500.0, disk_out=50.0,
                                                              disk_total=550.0)
        self.running_job_exe_high_cpus = self._create_running_job_exe(cpus=200.0, mem=512.0, disk_in=100.0,
                                                                      disk_out=200.0, disk_total=300.0)
        self.running_job_exe_high_mem = self._create_running_job_exe(cpus=2.0, mem=1048576.0, disk_in=100.0,
                                                                     disk_out=200.0, disk_total=300.0)
        self.running_job_exe_high_disk = self._create_running_job_exe(cpus=2.0, mem=512.0, disk_in=10000.0,
                                                                      disk_out=20000.0, disk_total=30000.0)

        # Queued job executions, following the same pattern
        self.queue_1 = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(cpus_required=200.0, mem_required=1024.0,
                                                             disk_in_required=100.0, disk_out_required=200.0,
                                                             disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(cpus_required=2.0, mem_required=10240.0,
                                                            disk_in_required=100.0, disk_out_required=200.0,
                                                            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0,
                                                             disk_in_required=10000.0, disk_out_required=20000.0,
                                                             disk_total_required=30000.0)

    @staticmethod
    def _create_running_job_exe(cpus, mem, disk_in, disk_out, disk_total):
        """Creates a RUNNING job execution model and sets its scheduled resource fields to the given values"""

        job_exe = job_test_utils.create_job_exe(status='RUNNING')
        job_exe.cpus_scheduled = cpus
        job_exe.mem_scheduled = mem
        job_exe.disk_in_scheduled = disk_in
        job_exe.disk_out_scheduled = disk_out
        job_exe.disk_total_scheduled = disk_total
        return job_exe

    @staticmethod
    def _add_standard_offers(node_offers, agent_id):
        """Adds the two offers shared by most tests (74 CPUs, 3072 mem, 3072 disk in total) and returns them"""

        offer_1 = ResourceOffer('offer_1', agent_id, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        offer_2 = ResourceOffer('offer_2', agent_id, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)
        return offer_1, offer_2

    def _assert_nothing_accepted(self, node_offers):
        """Asserts that the given node offers hold no accepted running or new job executions"""

        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    def _assert_available_resources(self, node_offers, cpus, mem, disk):
        """Asserts the available CPUs, memory and disk currently tracked by the given node offers"""

        self.assertEqual(node_offers._available_cpus, cpus)
        self.assertEqual(node_offers._available_mem, mem)
        self.assertEqual(node_offers._available_disk, disk)

    def test_adding_offers(self):
        """Tests adding offer and checking the results"""

        node_offers = NodeOffers(self.node)

        offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        node_offers.add_offer(offer_1)
        node_offers.add_offer(offer_1)  # Add same offer twice, should ignore

        offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=5.0, mem=2048.0, disk=2048.0))
        node_offers.add_offer(offer_2)

        offer_3 = ResourceOffer('offer_3', self.node_agent, NodeResources(cpus=3.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(offer_3)

        # Does not get added into total
        offer_4 = ResourceOffer('offer_4', 'bad_agent', NodeResources(cpus=1.0, mem=512.0, disk=1024.0))
        node_offers.add_offer(offer_4)

        # Expected totals are the sums of offers 1-3 only
        self._assert_available_resources(node_offers, cpus=10.0, mem=3584.0, disk=4096.0)

    def test_consider_new_job_exe(self):
        """Tests consider_new_job_exe() and get_accepted_new_job_exes()"""

        node_offers = NodeOffers(self.node)
        self._add_standard_offers(node_offers, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        result = node_offers.consider_new_job_exe(job_exe_1)  # Same job_exe, should have no effect
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_high_cpus = QueuedJobExecution(self.queue_high_cpus)
        result = node_offers.consider_new_job_exe(job_exe_high_cpus)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)

        job_exe_high_mem = QueuedJobExecution(self.queue_high_mem)
        result = node_offers.consider_new_job_exe(job_exe_high_mem)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)

        job_exe_high_disk = QueuedJobExecution(self.queue_high_disk)
        result = node_offers.consider_new_job_exe(job_exe_high_disk)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = QueuedJobExecution(self.queue_2)
        result = node_offers.consider_new_job_exe(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_new_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_new_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])

        # Expected values equal the offered totals minus the requirements of queue_1 and queue_2 (disk: disk_total)
        self._assert_available_resources(node_offers, cpus=64.0, mem=1536.0, disk=2327.0)

    def test_consider_next_task(self):
        """Tests consider_next_task() and get_accepted_running_job_exes()"""

        node_offers = NodeOffers(self.node)
        self._add_standard_offers(node_offers, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)
        result = node_offers.consider_next_task(job_exe_1)  # Same job_exe, should have no effect
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
        result = node_offers.consider_next_task(job_exe_high_cpus)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)

        job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
        result = node_offers.consider_next_task(job_exe_high_mem)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)

        job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
        result = node_offers.consider_next_task(job_exe_high_disk)
        self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Expected values equal the offered totals minus the scheduled resources of running job exes 1 and 2
        self._assert_available_resources(node_offers, cpus=68.0, mem=1536.0, disk=2222.0)

    def test_paused_node(self):
        """Tests adding job executions when the node is paused"""

        node_offers = NodeOffers(self.paused_node)
        self._add_standard_offers(node_offers, self.node_agent_paused)
        self._assert_nothing_accepted(node_offers)

        # Ensure it accepts new tasks for already running job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = node_offers.consider_next_task(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Don't accept new job executions while paused
        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NODE_PAUSED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        # Only the two running job executions were deducted, not the rejected new one
        self._assert_available_resources(node_offers, cpus=68.0, mem=1536.0, disk=2222.0)

    def test_lost_node(self):
        """Tests when the node is lost"""

        node_offers = NodeOffers(self.node)
        self._add_standard_offers(node_offers, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        # Accept a couple job executions
        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        job_exe_2 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_2)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertGreater(node_offers._available_cpus, 0.0)
        self.assertGreater(node_offers._available_mem, 0.0)
        self.assertGreater(node_offers._available_disk, 0.0)

        # Node is lost
        node_offers.lost_node()
        self.assertFalse(node_offers.has_accepted_job_exes())
        self._assert_available_resources(node_offers, cpus=0.0, mem=0.0, disk=0.0)

    def test_no_offers(self):
        """Tests adding job executions when there are no offers"""

        node_offers = NodeOffers(self.node)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.NO_OFFERS)

        job_exe_new = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_new)
        self.assertEqual(result, NodeOffers.NO_OFFERS)

        self._assert_nothing_accepted(node_offers)

    def test_job_exe_canceled(self):
        """Tests adding a job execution that becomes canceled while scheduling"""

        node_offers = NodeOffers(self.node)
        self._add_standard_offers(node_offers, self.node_agent)
        self._assert_nothing_accepted(node_offers)

        job_exe_1 = RunningJobExecution(self.running_job_exe_1)
        job_exe_1.execution_canceled()
        result = node_offers.consider_next_task(job_exe_1)
        self.assertEqual(result, NodeOffers.TASK_INVALID)

        self._assert_nothing_accepted(node_offers)

    def test_remove_offer(self):
        """Tests remove_offer()"""

        node_offers = NodeOffers(self.node)
        offer_1, offer_2 = self._add_standard_offers(node_offers, self.node_agent)

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = node_offers.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, NodeOffers.ACCEPTED)

        # Remove one offer, new job execution should still be accepted
        node_offers.remove_offer(offer_1.id)
        self.assertTrue(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [job_exe_1])

        # Remove second offer, no resources left, all job executions should be removed
        node_offers.remove_offer(offer_2.id)
        self.assertFalse(node_offers.has_accepted_job_exes())
        self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

        self._assert_available_resources(node_offers, cpus=0.0, mem=0.0, disk=0.0)
Beispiel #23
0
    def test_handle_failed_cleanup_task(self):
        """Tests that a failed cleanup task is only replaced by a new one after the error threshold"""

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): presumably keeps the health check task out of the way, as with the later assignment
        node._last_heath_task = start

        # The first task offered by the node is the initial cleanup task
        first_task = node.get_next_tasks(start)[0]
        self.assertTrue(first_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        first_task_id = first_task.id

        # Launch the task, then report it as RUNNING followed by FAILED to both the task manager and the node
        self.task_mgr.launch_tasks([first_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FAILED):
            status_update = job_test_utils.create_task_status_update(
                first_task.id, first_task.agent_id, status, now())
            self.task_mgr.handle_task_update(status_update)
            node.handle_task_update(status_update)

        # Shortly after the failure there should be nothing to run and cleanup is still incomplete
        shortly_after = start + datetime.timedelta(seconds=5)
        self.assertListEqual([], node.get_next_tasks(shortly_after))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Once the error threshold has passed, a different cleanup task should be handed out
        after_threshold = start + Node.CLEANUP_ERR_THRESHOLD + datetime.timedelta(seconds=5)
        node._last_heath_task = after_threshold  # Get rid of health check task
        retry_task = node.get_next_tasks(after_threshold)[0]
        self.assertNotEqual(retry_task.id, first_task_id)
        self.assertTrue(retry_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
Beispiel #24
0
class TestOfferManager(TestCase):
    """Tests OfferManager: considering queued and running job executions against offers spread over nodes"""

    def setUp(self):
        """Creates a regular node, a paused node, one running job execution on each, and several queued ones"""

        django.setup()

        Scheduler.objects.initialize_scheduler()

        self.node_agent = 'agent_1'
        self.node_agent_paused = 'agent_paused'
        self.node_model = node_test_utils.create_node(slave_id=self.node_agent)
        self.node = Node(self.node_agent, self.node_model)
        self.node.initial_cleanup_completed()
        self.paused_node_model = node_test_utils.create_node(slave_id=self.node_agent_paused)
        self.paused_node_model.is_paused = True
        self.paused_node = Node(self.node_agent_paused, self.paused_node_model)

        # Job exe 1 runs on the paused node, job exe 2 on the regular node
        self.running_job_exe_1 = self._create_running_job_exe(self.paused_node_model)
        self.running_job_exe_2 = self._create_running_job_exe(self.node_model)

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        self.queue_high_cpus = queue_test_utils.create_queue(cpus_required=200.0, mem_required=1024.0,
                                                             disk_in_required=100.0, disk_out_required=200.0,
                                                             disk_total_required=300.0)
        self.queue_high_mem = queue_test_utils.create_queue(cpus_required=2.0, mem_required=10240.0,
                                                            disk_in_required=100.0, disk_out_required=200.0,
                                                            disk_total_required=300.0)
        self.queue_high_disk = queue_test_utils.create_queue(cpus_required=2.0, mem_required=1024.0,
                                                             disk_in_required=10000.0, disk_out_required=20000.0,
                                                             disk_total_required=30000.0)

    @staticmethod
    def _create_running_job_exe(node_model):
        """Creates a RUNNING job execution model on the given node model with fixed scheduled resources"""

        job_exe = job_test_utils.create_job_exe(status='RUNNING', node=node_model)
        job_exe.cpus_scheduled = 2.0
        job_exe.mem_scheduled = 512.0
        job_exe.disk_in_scheduled = 100.0
        job_exe.disk_out_scheduled = 200.0
        job_exe.disk_total_scheduled = 300.0
        return job_exe

    @staticmethod
    def _create_offers(agent_id_1, agent_id_2):
        """Returns the small 'offer_1' and the large 'offer_2' used by these tests, for the given agent IDs"""

        offer_1 = ResourceOffer('offer_1', agent_id_1, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offer_2 = ResourceOffer('offer_2', agent_id_2, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
        return offer_1, offer_2

    @staticmethod
    def _create_ready_manager(offers, nodes):
        """Returns a new OfferManager after adding the offers, updating it with the nodes and readying the offers"""

        manager = OfferManager()
        manager.add_new_offers(offers)
        manager.update_nodes(nodes)
        manager.ready_new_offers()
        return manager

    def _assert_new_job_exe_rejected(self, queue, expected_result):
        """Asserts that a queued job execution for the given queue model is rejected with the expected result

        Offers are spread over the paused and the regular node; no offers with accepted job executions should remain.
        """

        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent)
        manager = self._create_ready_manager([offer_1, offer_2], [self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(queue)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, expected_result)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_no_ready_offers(self):
        """Tests considering job executions when no offers are ready"""

        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent)

        # Offers are added, but never readied
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.NODE_OFFLINE)

    def test_offers_with_no_nodes(self):
        """Tests considering job executions when offers cannot be readied due to no nodes updated"""

        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent)

        # Offers are readied without the manager ever being updated with nodes
        manager = OfferManager()
        manager.add_new_offers([offer_1, offer_2])
        manager.ready_new_offers()

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.NODE_OFFLINE)

    def test_accepted(self):
        """Tests accepting a running and queued job execution and returning the node offers"""

        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent)
        manager = self._create_ready_manager([offer_1, offer_2], [self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 2)

    def test_remove_offers(self):
        """Tests accepting a running and queued job execution and then removing all offers"""

        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent)
        manager = self._create_ready_manager([offer_1, offer_2], [self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_1)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        manager.remove_offers([offer_2.id, offer_1.id])
        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_lost_node(self):
        """Tests accepting a running and queued job execution and then the node being lost"""

        # Both offers belong to the regular node here
        offer_1, offer_2 = self._create_offers(self.node_agent, self.node_agent)
        manager = self._create_ready_manager([offer_1, offer_2], [self.node, self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)

        job_exe_2 = RunningJobExecution(self.running_job_exe_2)
        result = manager.consider_next_task(job_exe_2)
        self.assertEqual(result, OfferManager.ACCEPTED)

        manager.lost_node(self.node_agent)
        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_lost_node_that_comes_back(self):
        """Tests that when a lost node comes back, it can schedule tasks again"""

        offer_1, offer_2 = self._create_offers(self.node_agent, self.node_agent)
        manager = self._create_ready_manager([offer_1, offer_2], [self.node])

        # Node goes down and comes back up with new agent ID
        manager.lost_node(self.node_agent)
        new_node_agent = 'i_am_a_new_node_agent'
        self.node.update_from_mesos(agent_id=new_node_agent)

        job_exe_1 = QueuedJobExecution(self.queue_1)

        # Offers for previous agent should be gone, do not schedule the job exe
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        offer_3 = ResourceOffer('offer_3', new_node_agent, NodeResources(cpus=35.0, mem=3048.0, disk=3048.0))
        manager.add_new_offers([offer_3])
        manager.update_nodes([self.node])
        manager.ready_new_offers()

        # New offers have come in for new agent ID, should schedule job exe now
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.ACCEPTED)
        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 1)

    def test_all_offers_paused(self):
        """Tests rejecting a queued job execution due to all nodes being paused"""

        # Both offers belong to the paused node, and only that node is known to the manager
        offer_1, offer_2 = self._create_offers(self.node_agent_paused, self.node_agent_paused)
        manager = self._create_ready_manager([offer_1, offer_2], [self.paused_node])

        job_exe_1 = QueuedJobExecution(self.queue_1)
        result = manager.consider_new_job_exe(job_exe_1)
        self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

        node_offers = manager.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(node_offers), 0)

    def test_high_cpus(self):
        """Tests rejecting a queued job execution due to too many CPUs required"""

        self._assert_new_job_exe_rejected(self.queue_high_cpus, OfferManager.NOT_ENOUGH_CPUS)

    def test_high_mem(self):
        """Tests rejecting a queued job execution due to too much memory required"""

        self._assert_new_job_exe_rejected(self.queue_high_mem, OfferManager.NOT_ENOUGH_MEM)

    def test_high_disk(self):
        """Tests rejecting a queued job execution due to too much disk required"""

        self._assert_new_job_exe_rejected(self.queue_high_disk, OfferManager.NOT_ENOUGH_DISK)