Exemple #1
0
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(
            Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
Exemple #2
0
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
Exemple #3
0
    def test_pre_task_launch_error(self):
        """Tests running through a job execution where a pre-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task fails to launch
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = now()
        running_job_exe.task_fail(pre_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Exemple #4
0
    def test_post_task_launch_error(self):
        """Tests running through a job execution where a post-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        running_job_exe.task_start(job_task_id, job_task_started)

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 0
        job_task_results.when = job_task_completed
        running_job_exe.task_complete(job_task_results)

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id

        # Post-task fails to launch
        post_task_results = TaskResults(post_task_id)
        post_task_results.exit_code = 1
        post_task_results.when = now()
        running_job_exe.task_fail(post_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Exemple #5
0
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        error = error_test_utils.create_error()
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(
            minutes=5
        )  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_failed
        running_job_exe.task_fail(pre_task_results, error)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(error.id, job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Exemple #6
0
    def test_pre_task_launch_error(self):
        """Tests running through a job execution where a pre-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task fails to launch
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.when = now()
        running_job_exe.task_fail(pre_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'task-launch')
Exemple #7
0
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Exemple #8
0
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        error = error_test_utils.create_error()
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_failed
        running_job_exe.task_fail(pre_task_results, error)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(error.id, job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Exemple #9
0
    def test_general_algorithm_error(self):
        """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit code)
        """

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        running_job_exe.task_start(job_task_id, job_task_started)

        # Fail job-task
        job_task_failed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = job_task_failed
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'algorithm-unknown')
Exemple #10
0
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Exemple #11
0
    def test_job_task_launch_error(self):
        """Tests running through a job execution where a Docker-based job-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_running(pre_task_id, pre_task_started, '', '')

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task fails to launch
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = now()
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(
            id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Exemple #12
0
    def test_general_algorithm_error(self):
        """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit code)
        """

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        running_job_exe.task_start(pre_task_id, pre_task_started)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 0
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        running_job_exe.task_start(job_task_id, job_task_started)

        # Fail job-task
        job_task_failed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 1
        job_task_results.when = job_task_failed
        running_job_exe.task_fail(job_task_results)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'algorithm-unknown')
Exemple #13
0
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        task_id = status.task_id.value
        job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

        # Since we have a status update for this task, remove it from reconciliation set
        self._recon_thread.remove_task_id(task_id)

        try:
            running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

            if running_job_exe:
                results = TaskResults(task_id)
                results.exit_code = utils.parse_exit_code(status)
                results.when = utils.get_status_timestamp(status)
                if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                    mesos_pb2.TASK_KILLED]:
                    try:
                        log_start_time = now()
                        hostname = running_job_exe._node_hostname
                        port = running_job_exe._node_port
                        task_dir = get_slave_task_directory(hostname, port, task_id)
                        results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                        results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                        log_end_time = now()
                        logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                    except Exception:
                        logger.exception('Error pulling logs for task %s', task_id)

                # Apply status update to running job execution
                if status.state == mesos_pb2.TASK_RUNNING:
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                    stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                    running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
                elif status.state == mesos_pb2.TASK_FINISHED:
                    running_job_exe.task_complete(results)
                elif status.state == mesos_pb2.TASK_LOST:
                    running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
                elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                    running_job_exe.task_fail(results)

                # Remove finished job execution
                if running_job_exe.is_finished():
                    self._job_exe_manager.remove_job_exe(job_exe_id)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            self._recon_thread.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Exemple #14
0
    def test_successful_normal_job_execution(self):
        """Tests running through a normal job execution successfully"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task sets updated command arguments
        updated_commands_args = '-arg updated'
        JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Job-task running
        job_task_started = pre_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(job_task_id, job_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 2
        job_task_results.when = job_task_completed
        running_job_exe.task_complete(job_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Post-task running
        post_task_started = job_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(post_task_id, post_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete post-task
        post_task_completed = post_task_started + timedelta(seconds=1)
        post_task_results = TaskResults(post_task_id)
        post_task_results.exit_code = 3
        post_task_results.when = post_task_completed
        running_job_exe.task_complete(post_task_results)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_completed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual(job_task_started, job_exe.job_started)
        self.assertEqual(job_task_completed, job_exe.job_completed)
        self.assertEqual(2, job_exe.job_exit_code)
        self.assertEqual(post_task_started, job_exe.post_started)
        self.assertEqual(post_task_completed, job_exe.post_completed)
        self.assertEqual(3, job_exe.post_exit_code)
        self.assertEqual('COMPLETED', job_exe.status)
        self.assertGreater(job_exe.ended, post_task_completed)
Exemple #15
0
    def test_successful_normal_job_execution(self):
        """Tests running through a normal job execution successfully"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(
            self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(
            minutes=5
        )  # Lots of time so now() called at completion is in future
        running_job_exe.task_start(pre_task_id, pre_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task sets updated command arguments
        updated_commands_args = '-arg updated'
        JobExecution.objects.filter(id=self._job_exe_id).update(
            command_arguments=updated_commands_args)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        pre_task_results = TaskResults(pre_task_id)
        pre_task_results.exit_code = 1
        pre_task_results.when = pre_task_completed
        running_job_exe.task_complete(pre_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        self.assertEqual(task._command_arguments, updated_commands_args
                         )  # Make sure job task has updated command args
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Job-task running
        job_task_started = pre_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(job_task_id, job_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        job_task_results = TaskResults(job_task_id)
        job_task_results.exit_code = 2
        job_task_results.when = job_task_completed
        running_job_exe.task_complete(job_task_results)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Post-task running
        post_task_started = job_task_completed + timedelta(seconds=1)
        running_job_exe.task_start(post_task_id, post_task_started)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete post-task
        post_task_completed = post_task_started + timedelta(seconds=1)
        post_task_results = TaskResults(post_task_id)
        post_task_results.exit_code = 3
        post_task_results.when = post_task_completed
        running_job_exe.task_complete(post_task_results)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_completed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual(job_task_started, job_exe.job_started)
        self.assertEqual(job_task_completed, job_exe.job_completed)
        self.assertEqual(2, job_exe.job_exit_code)
        self.assertEqual(post_task_started, job_exe.post_started)
        self.assertEqual(post_task_completed, job_exe.post_completed)
        self.assertEqual(3, job_exe.post_exit_code)
        self.assertEqual('COMPLETED', job_exe.status)
        self.assertGreater(job_exe.ended, post_task_completed)
Exemple #16
0
    def statusUpdate(self, driver, status):
        '''
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        '''

        started = now()

        task_id = status.task_id.value
        job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id,
                    utils.status_to_string(status.state))

        # Since we have a status update for this task, remove it from reconciliation set
        self._recon_thread.remove_task_id(task_id)

        try:
            running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

            if running_job_exe:
                results = TaskResults(task_id)
                results.exit_code = utils.parse_exit_code(status)
                results.when = utils.get_status_timestamp(status)
                if status.state in [
                        mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR,
                        mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED
                ]:
                    try:
                        log_start_time = now()
                        hostname = running_job_exe._node_hostname
                        port = running_job_exe._node_port
                        task_dir = get_slave_task_directory(
                            hostname, port, task_id)
                        results.stdout = get_slave_task_file(
                            hostname, port, task_dir, 'stdout')
                        results.stderr = get_slave_task_file(
                            hostname, port, task_dir, 'stderr')
                        log_end_time = now()
                        logger.debug('Time to pull logs for task: %s',
                                     str(log_end_time - log_start_time))
                    except Exception:
                        logger.exception('Error pulling logs for task %s',
                                         task_id)

                # Apply status update to running job execution
                if status.state == mesos_pb2.TASK_RUNNING:
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(
                        hostname, port, task_id)
                    stdout_url = get_slave_task_url(hostname, port, task_dir,
                                                    'stdout')
                    stderr_url = get_slave_task_url(hostname, port, task_dir,
                                                    'stderr')
                    running_job_exe.task_running(task_id, results.when,
                                                 stdout_url, stderr_url)
                elif status.state == mesos_pb2.TASK_FINISHED:
                    running_job_exe.task_complete(results)
                elif status.state == mesos_pb2.TASK_LOST:
                    running_job_exe.task_fail(
                        results, Error.objects.get_builtin_error('mesos-lost'))
                elif status.state in [
                        mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                        mesos_pb2.TASK_KILLED
                ]:
                    running_job_exe.task_fail(results)

                # Remove finished job execution
                if running_job_exe.is_finished():
                    self._job_exe_manager.remove_job_exe(job_exe_id)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(
                    job_exe_id, now(),
                    Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception(
                'Error handling status update for job execution: %s',
                job_exe_id)
            # Error handling status update, add task so it can be reconciled
            self._recon_thread.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())