Esempio n. 1
0
    def test_get_tasks_to_kill(self):
        """Tests that get_tasks_to_kill() returns the surplus tasks when the desired count is lowered"""

        # Launch an initial pool of 5 message handler tasks
        scheduler_mgr.config.num_message_handlers = 5
        messaging_service = MessagingService()
        launched_tasks = messaging_service.get_tasks_to_schedule()
        task_mgr.launch_tasks(launched_tasks, now())

        # With only 3 handlers desired, 2 tasks should be returned for killing
        scheduler_mgr.config.num_message_handlers = 3
        surplus_tasks = messaging_service.get_tasks_to_kill()
        self.assertEqual(len(surplus_tasks), 2)

        # Report each surplus task as KILLED, first to the task manager and then to the service
        for surplus_task in surplus_tasks:
            killed_update = job_test_utils.create_task_status_update(surplus_task.id, surplus_task.agent_id,
                                                                     TaskStatusUpdate.KILLED, now())
            for handler in (task_mgr, messaging_service):
                handler.handle_task_update(killed_update)
        self.assertEqual(messaging_service.get_actual_task_count(), 3)

        # With 10 handlers desired, nothing should be returned for killing
        scheduler_mgr.config.num_message_handlers = 10
        surplus_tasks = messaging_service.get_tasks_to_kill()
        self.assertEqual(len(surplus_tasks), 0)
Esempio n. 2
0
    def test_handle_task_update(self):
        """Tests that handle_task_update() drops a failed task so that a replacement gets scheduled"""

        # Launch an initial pool of 5 message handler tasks
        scheduler_mgr.config.num_message_handlers = 5
        messaging_service = MessagingService()
        launched_tasks = messaging_service.get_tasks_to_schedule()
        task_mgr.launch_tasks(launched_tasks, now())

        # Report the first launched task as FAILED, first to the task manager and then to the service
        failed_task = launched_tasks[0]
        failed_update = job_test_utils.create_task_status_update(failed_task.id, failed_task.agent_id,
                                                                 TaskStatusUpdate.FAILED, now())
        for handler in (task_mgr, messaging_service):
            handler.handle_task_update(failed_update)
        self.assertEqual(messaging_service.get_actual_task_count(), 4)

        # Exactly one replacement task should now be offered for scheduling
        replacement_tasks = messaging_service.get_tasks_to_schedule()
        self.assertEqual(len(replacement_tasks), 1)
Esempio n. 3
0
    def update(self, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        The update is logged, removed from the reconciliation set, queued to be
        saved, applied to the task manager and then routed to either the job
        execution manager (job tasks) or the node and system task managers (all
        other tasks). Finally the scheduler's update counts and the time spent in
        this callback are recorded.

        :param status: The task status update received from the cluster; it is only
            ever passed to the ``utils`` helpers here (presumably a Mesos task
            status message - confirm against the caller)
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
        task_id = task_update.task_id
        was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
        was_job_finished = False

        # Log the update exactly once, at a level matching its severity. This must be a single if/elif/else chain:
        # with two independent "if" statements a TASK_ERROR update would also fall into the final "else" and be
        # logged a second time at INFO level.
        if mesos_status == 'TASK_ERROR':
            logger.error('Status update for task %s: %s', task_id, mesos_status)
        elif mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        is_job_task = task_id.startswith(JOB_TASK_ID_PREFIX)

        # Hand off task update to be saved in the database
        if is_job_task:
            # Grab job execution ID from manager
            cluster_id = JobExecution.parse_cluster_id(task_id)
            job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
            if job_exe:
                model.job_exe_id = job_exe.id
        task_update_mgr.add_task_update(model)

        # Update task with latest status
        # This should happen before the job execution or node manager are updated, since they will assume that the task
        # has already been updated
        task_mgr.handle_task_update(task_update)

        if is_job_task:
            # Job task, so update the job execution
            try:
                job_exe = job_exe_mgr.handle_task_update(task_update)
                if job_exe and job_exe.is_finished():
                    logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                    was_job_finished = True
                    # The finished execution is handed to the cleanup manager and its GPUs are released
                    cleanup_mgr.add_job_execution(job_exe)
                    GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)

            except Exception:
                # Deliberately broad: a failure here must not escape the callback
                cluster_id = JobExecution.parse_cluster_id(task_id)
                logger.exception('Error handling status update for job execution: %s', cluster_id)
                # Error handling status update, add task so it can be reconciled
                task = task_mgr.get_task(task_id)
                if task:
                    recon_mgr.add_tasks([task])
        else:
            # Not a job task, so must be either a node or system task
            node_mgr.handle_task_update(task_update)
            system_task_mgr.handle_task_update(task_update)

        scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

        # Report how long the callback took, escalating to a warning when it exceeds the normal threshold
        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())