Beispiel #1
0
    def test_handle_successful_pull_task(self):
        """Tests that a finished Docker pull task leaves the image pulled and the node ready"""

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): presumably keeps a health task from being offered first; the attribute is
        # spelled `_last_heath_task` here - confirm it matches the name used by Node
        node._last_heath_task = start
        node._initial_cleanup_completed()
        node._update_state()

        # The first task offered is expected to be the Docker pull task for this agent
        pull_task = node.get_next_tasks(start)[0]
        self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(pull_task.agent_id, self.node_agent)

        # While the pull task is launched, nothing else is offered and the image is not pulled yet
        self.task_mgr.launch_tasks([pull_task], now())
        self.assertListEqual([], node.get_next_tasks(start))
        self.assertFalse(node._is_image_pulled)

        # Report the task as RUNNING and then FINISHED to both the task manager and the node
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(
                pull_task.id, pull_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # After the pull: no follow-up task, image flagged as pulled, node in the READY state
        self.assertListEqual([], node.get_next_tasks(start))
        self.assertTrue(node._is_image_pulled)
        self.assertEqual(node._state, Node.READY)
Beispiel #2
0
    def test_handle_failed_cleanup_task(self):
        """Tests that a failed initial cleanup task is only retried after the error threshold"""

        def deliver(task, status):
            """Creates a status update for the task and passes it to the task manager, then the node"""
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): attribute is spelled `_last_heath_task` - confirm it matches Node
        node._last_heath_task = start

        # The first task offered is expected to be the initial cleanup task
        first_task = node.get_next_tasks(start)[0]
        self.assertTrue(first_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        first_task_id = first_task.id

        # Launch the task, then report it as running and finally as failed
        self.task_mgr.launch_tasks([first_task], now())
        deliver(first_task, TaskStatusUpdate.RUNNING)
        deliver(first_task, TaskStatusUpdate.FAILED)

        # Five seconds later no replacement cleanup task is offered yet
        shortly_after = start + datetime.timedelta(seconds=5)
        self.assertListEqual([], node.get_next_tasks(shortly_after))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Past the error threshold a cleanup task with a new ID is handed out
        past_threshold = start + Node.CLEANUP_ERR_THRESHOLD + datetime.timedelta(seconds=5)
        node._last_heath_task = past_threshold  # Get rid of health check task
        retry_task = node.get_next_tasks(past_threshold)[0]
        self.assertNotEqual(retry_task.id, first_task_id)
        self.assertTrue(retry_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
Beispiel #3
0
    def test_handle_killed_pull_task(self):
        """Tests that a killed Docker pull task is replaced by a new pull task"""

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): attribute is spelled `_last_heath_task` - confirm it matches Node
        node._last_heath_task = start
        node._initial_cleanup_completed()
        node._update_state()

        # The first task offered is the pull task; remember its ID for the later comparison
        killed_task = node.get_next_tasks(start)[0]
        self.assertTrue(killed_task.id.startswith(PULL_TASK_ID_PREFIX))
        killed_task_id = killed_task.id

        # Launch the task, then report it as running and afterwards as killed
        self.task_mgr.launch_tasks([killed_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.KILLED):
            update = job_test_utils.create_task_status_update(
                killed_task.id, killed_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # A pull task with a different ID is offered right away; the image is still not pulled
        replacement = node.get_next_tasks(start)[0]
        self.assertTrue(replacement.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertNotEqual(replacement.id, killed_task_id)
        self.assertFalse(node._is_image_pulled)
Beispiel #4
0
    def test_handle_failed_health_task_low_docker_space(self):
        """Tests handling a failed health task where Docker has low disk space

        The health task is failed with HealthTask.LOW_DOCKER_SPACE_CODE as its exit code. The node
        is then expected to be DEGRADED with the LOW_DOCKER_SPACE_ERR condition among its active
        errors.
        """

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with low Docker space exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id,
            task.agent_id,
            TaskStatusUpdate.FAILED,
            now(),
            exit_code=HealthTask.LOW_DOCKER_SPACE_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        # assertIn reports the missing name and the container's contents when it fails
        self.assertIn(NodeConditions.LOW_DOCKER_SPACE_ERR.name,
                      node._conditions._active_errors)
Beispiel #5
0
    def test_handle_successful_health_task(self):
        """Tests that a finished health task leaves the node's health check normal"""

        start = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # The first task offered is expected to be the health task for this agent
        health_task = node.get_next_tasks(start)[0]
        self.assertTrue(health_task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(health_task.agent_id, self.node_agent)

        # While the health task is launched, nothing else is offered
        self.task_mgr.launch_tasks([health_task], now())
        self.assertListEqual([], node.get_next_tasks(start))
        self.assertTrue(node._conditions.is_health_check_normal)

        # Report the health task as RUNNING and then FINISHED
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(
                health_task.id, health_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # No new task at the same timestamp, and the health check is still normal
        self.assertListEqual([], node.get_next_tasks(start))
        self.assertTrue(node._conditions.is_health_check_normal)
Beispiel #6
0
    def test_handle_regular_cleanup_task(self):
        """Tests that a regular (non-initial) cleanup task is offered for an added job execution"""

        def deliver(task, status):
            """Creates a status update for the task and passes it to the task manager, then the node"""
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): attribute is spelled `_last_heath_task` - confirm it matches Node
        node._last_heath_task = start
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # With no job executions to clean, no task is offered
        self.assertListEqual([], node.get_next_tasks(start))

        # Adding a job execution yields a non-initial cleanup task covering exactly that execution
        running_exe = RunningJobExecution(self.job_exe)
        node.add_job_execution(running_exe)
        cleanup_task = node.get_next_tasks(start)[0]
        self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertFalse(cleanup_task.is_initial_cleanup)
        self.assertListEqual(cleanup_task.job_exes, [running_exe])

        # Launch the cleanup task and run it to completion
        self.task_mgr.launch_tasks([cleanup_task], now())
        deliver(cleanup_task, TaskStatusUpdate.RUNNING)
        deliver(cleanup_task, TaskStatusUpdate.FINISHED)

        # Everything has been cleaned, so no further task is offered
        self.assertListEqual([], node.get_next_tasks(start))
Beispiel #7
0
    def test_handle_failed_health_task_bad_logstash(self):
        """Tests handling a failed health task where logstash is unreachable

        The health task is failed with HealthTask.BAD_LOGSTASH_CODE as its exit code. The node is
        then expected to be DEGRADED with the BAD_LOGSTASH_ERR condition among its active errors.
        """

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with bad logstash exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id,
            task.agent_id,
            TaskStatusUpdate.FAILED,
            now(),
            exit_code=HealthTask.BAD_LOGSTASH_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        # assertIn reports the missing name and the container's contents when it fails
        self.assertIn(NodeConditions.BAD_LOGSTASH_ERR.name,
                      node._conditions._active_errors)
Beispiel #8
0
    def test_handle_initial_cleanup_task(self):
        """Tests that finishing the initial cleanup task marks the initial cleanup as completed"""

        start = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): attribute is spelled `_last_heath_task` - confirm it matches Node
        node._last_heath_task = start

        # The first task offered is expected to be the initial cleanup task for this agent
        cleanup_task = node.get_next_tasks(start)[0]
        self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(cleanup_task.is_initial_cleanup)
        self.assertEqual(cleanup_task.agent_id, self.node_agent)

        # While the cleanup is launched, nothing else is offered and cleanup is not complete yet
        self.task_mgr.launch_tasks([cleanup_task], now())
        self.assertListEqual([], node.get_next_tasks(start))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Report the cleanup task as RUNNING and then FINISHED
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            update = job_test_utils.create_task_status_update(
                cleanup_task.id, cleanup_task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        # Whatever is offered next must not be another cleanup task
        for next_task in node.get_next_tasks(start):
            self.assertFalse(next_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(node._is_initial_cleanup_completed)
Beispiel #9
0
    def test_handle_failed_pull_task(self):
        """Tests that a failed Docker pull task is only retried after the error threshold"""

        def deliver(task, status):
            """Creates a status update for the task and passes it to the task manager, then the node"""
            update = job_test_utils.create_task_status_update(task.id, task.agent_id, status, now())
            self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        start = now()
        node = Node(self.node_agent, self.node, self.scheduler)
        node._last_health_task = start
        node._initial_cleanup_completed()
        node._update_state()

        # The first task offered is expected to be the Docker pull task
        first_task = node.get_next_tasks(start)[0]
        first_task_id = first_task.id
        self.assertTrue(first_task.id.startswith(PULL_TASK_ID_PREFIX))

        # Launch the task, then report it as running and finally as failed
        self.task_mgr.launch_tasks([first_task], now())
        deliver(first_task, TaskStatusUpdate.RUNNING)
        deliver(first_task, TaskStatusUpdate.FAILED)

        # Five seconds later no replacement pull task is offered yet
        shortly_after = start + datetime.timedelta(seconds=5)
        self.assertListEqual([], node.get_next_tasks(shortly_after))
        self.assertFalse(node._is_image_pulled)

        # Past the error threshold a pull task with a new ID is handed out
        past_threshold = start + Node.IMAGE_PULL_ERR_THRESHOLD + datetime.timedelta(seconds=5)
        node._last_health_task = past_threshold  # Get rid of health check task
        retry_task = node.get_next_tasks(past_threshold)[0]
        self.assertNotEqual(retry_task.id, first_task_id)
        self.assertTrue(retry_task.id.startswith(PULL_TASK_ID_PREFIX))
Beispiel #10
0
    def test_handle_failed_health_task(self):
        """Tests handling failed health task

        After the failure the node is expected to be DEGRADED with the HEALTH_FAIL_ERR condition
        among its active errors, to offer no health task five seconds later, and to offer a health
        task with a new ID once Node.HEALTH_ERR_THRESHOLD has passed.
        """

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        task_1_id = task.id
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task after running
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        # assertIn reports the missing name and the container's contents when it fails
        self.assertIn(NodeConditions.HEALTH_FAIL_ERR.name,
                      node._conditions._active_errors)

        # No new health task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._conditions.is_health_check_normal)

        # After error threshold, we should get new health task
        new_time = when + Node.HEALTH_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
Beispiel #11
0
    def test_handle_lost_pull_task(self):
        """Tests handling lost pull task

        The pull task is lost in three situations (never launched, launched, launched and running).
        Each time the node is expected to offer a pull task with the same ID again and to still
        report the image as not pulled.
        """

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): attribute is spelled `_last_heath_task` - confirm it matches Node
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        # Check for None before the task is dereferenced so that this assertion can actually fail
        self.assertIsNotNone(task)
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        task_1_id = task.id

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task with scheduling and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task after running and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)
Beispiel #12
0
    def test_handle_lost_health_task(self):
        """Tests handling lost health task

        The health task is lost in three situations (never launched, launched, launched and
        running). Each time the node is expected to offer a health task whose ID differs from the
        first one and to still report the health check as normal.
        """

        when = now()
        node = Node(self.node_agent, self.node, self.scheduler)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        # Check for None before the task is dereferenced so that this assertion can actually fail
        self.assertIsNotNone(task)
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        task_1_id = task.id

        # Lose task without scheduling and get different task next time
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task with scheduling and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)
Beispiel #13
0
    def test_handle_lost_cleanup_tasks(self):
        """Tests that a lost initial cleanup task is always replaced by one with a different ID"""

        def deliver(task, status, notify_task_mgr=True):
            """Creates a status update for the task and passes it on

            The task manager only receives the update when notify_task_mgr is set; the node always
            receives it, and always after the task manager.
            """
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, status, now())
            if notify_task_mgr:
                self.task_mgr.handle_task_update(update)
            node.handle_task_update(update)

        def next_cleanup_task():
            """Fetches the next task and checks it is a cleanup task with an ID unlike the first"""
            task = node.get_next_tasks(start)[0]
            self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
            self.assertNotEqual(task.id, first_task_id)
            self.assertFalse(node._is_initial_cleanup_completed)
            return task

        start = now()
        node = Node(self.node_agent, self.node, self.scheduler)

        # The first task offered is expected to be the initial cleanup task
        current = node.get_next_tasks(start)[0]
        self.assertTrue(current.id.startswith(CLEANUP_TASK_ID_PREFIX))
        first_task_id = current.id

        # Case 1: the task is lost before it was ever launched (only the node hears about it)
        deliver(current, TaskStatusUpdate.LOST, notify_task_mgr=False)
        current = next_cleanup_task()

        # Case 2: the task is lost after being launched
        self.task_mgr.launch_tasks([current], now())
        deliver(current, TaskStatusUpdate.LOST)
        current = next_cleanup_task()

        # Case 3: the task is lost after being launched and reported as running
        self.task_mgr.launch_tasks([current], now())
        deliver(current, TaskStatusUpdate.RUNNING)
        deliver(current, TaskStatusUpdate.LOST)
        current = next_cleanup_task()