def test_post_task_launch_error(self):
    """Tests running through a job execution where a post-task fails to launch"""

    # Reset the built-in error cache so the lookup below hits the database
    CACHED_BUILTIN_ERRORS.clear()
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: launch, start, and complete successfully
    pre_id = running_job_exe.start_next_task().id
    pre_started = now()
    running_job_exe.task_start(pre_id, pre_started)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 0
    pre_results.when = pre_started + timedelta(seconds=1)
    running_job_exe.task_complete(pre_results)

    # Job-task: launch, start, and complete successfully
    job_id = running_job_exe.start_next_task().id
    job_started = now()
    running_job_exe.task_start(job_id, job_started)
    job_results = TaskResults(job_id)
    job_results.exit_code = 0
    job_results.when = job_started + timedelta(seconds=1)
    running_job_exe.task_complete(job_results)

    # Post-task fails right after launch, without ever reporting as running
    post_id = running_job_exe.start_next_task().id
    post_results = TaskResults(post_id)
    post_results.exit_code = 1
    post_results.when = now()
    running_job_exe.task_fail(post_results)

    # The execution should be failed with the Docker task launch error
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'docker-task-launch')
def test_lost_job_execution(self):
    """Tests running through a job execution that gets lost"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: launch, start, and complete successfully
    pre_id = running_job_exe.start_next_task().id
    pre_started = now()
    running_job_exe.task_start(pre_id, pre_started)
    pre_done = pre_started + timedelta(seconds=1)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 0
    pre_results.when = pre_done
    running_job_exe.task_complete(pre_results)

    # The execution is lost while the job-task is outstanding
    when_lost = pre_done + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    lost_task = running_job_exe.execution_lost(when_lost)

    # The outstanding job-task is the one reported lost, and the execution ends
    self.assertEqual(job_task.id, lost_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # The model should be failed with the node-lost error at the lost time
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
    self.assertEqual(when_lost, job_exe.ended)
def test_general_algorithm_error(self):
    """Tests running through a job execution where the job-task has a general
    algorithm error (non-zero exit code)
    """

    # Reset the built-in error cache so the lookup below hits the database
    CACHED_BUILTIN_ERRORS.clear()
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: launch, start, and complete successfully
    pre_id = running_job_exe.start_next_task().id
    pre_started = now()
    running_job_exe.task_start(pre_id, pre_started)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 0
    pre_results.when = pre_started + timedelta(seconds=1)
    running_job_exe.task_complete(pre_results)

    # Job-task: launch, start, then fail with a non-zero exit code
    job_id = running_job_exe.start_next_task().id
    job_started = now()
    running_job_exe.task_start(job_id, job_started)
    job_results = TaskResults(job_id)
    job_results.exit_code = 1
    job_results.when = job_started + timedelta(seconds=1)
    running_job_exe.task_fail(job_results)

    # The execution should be failed with the unknown-algorithm error
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'algorithm-unknown')
def test_job_task_launch_error(self):
    """Tests running through a job execution where a Docker-based job-task fails to launch"""

    # Reset the built-in error cache so the lookup below hits the database
    CACHED_BUILTIN_ERRORS.clear()
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: launch, start, and complete successfully
    pre_id = running_job_exe.start_next_task().id
    pre_started = now()
    running_job_exe.task_start(pre_id, pre_started)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 0
    pre_results.when = pre_started + timedelta(seconds=1)
    running_job_exe.task_complete(pre_results)

    # Job-task fails right after launch, without ever reporting as running
    job_id = running_job_exe.start_next_task().id
    job_results = TaskResults(job_id)
    job_results.exit_code = 1
    job_results.when = now()
    running_job_exe.task_fail(job_results)

    # The execution should be failed with the Docker task launch error
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'docker-task-launch')
def test_failed_normal_job_execution(self):
    """Tests running through a normal job execution that fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    error = error_test_utils.create_error()
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the pre-task
    pre_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start it well in the past so now() called at failure is in the future
    pre_started = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_task_id, pre_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Fail the pre-task with the test error
    pre_failed = pre_started + timedelta(seconds=1)
    pre_results = TaskResults(pre_task_id)
    pre_results.exit_code = 1
    pre_results.when = pre_failed
    running_job_exe.task_fail(pre_results, error)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Failure details should be persisted on the job execution model
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_started, job_exe.pre_started)
    self.assertEqual(pre_failed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(error.id, job_exe.error_id)
    self.assertGreater(job_exe.ended, pre_failed)
def test_pre_task_launch_error(self):
    """Tests running through a job execution where a pre-task fails to launch"""

    # Reset the built-in error cache so the lookup below hits the database
    CACHED_BUILTIN_ERRORS.clear()
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task fails right after launch, without ever reporting as running
    pre_task_id = running_job_exe.start_next_task().id
    pre_results = TaskResults(pre_task_id)
    pre_results.when = now()
    running_job_exe.task_fail(pre_results)

    # The execution should be failed with the task launch error
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'task-launch')
def test_canceled_job_execution(self):
    """Tests running through a job execution that gets canceled"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: launch, start, and complete successfully
    pre_id = running_job_exe.start_next_task().id
    pre_started = now()
    running_job_exe.task_start(pre_id, pre_started)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 0
    pre_results.when = pre_started + timedelta(seconds=1)
    running_job_exe.task_complete(pre_results)

    # Cancel the execution while the job-task is outstanding
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()

    # The outstanding job-task is the one canceled, and the execution ends
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_failed_normal_job_execution(self):
    """Tests running through a normal job execution that fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    error = error_test_utils.create_error()
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the pre-task
    pre_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start it well in the past so now() called at failure is in the future
    pre_task_started = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_task_id, pre_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Fail the pre-task with the test error
    pre_task_failed = pre_task_started + timedelta(seconds=1)
    failed_results = TaskResults(pre_task_id)
    failed_results.exit_code = 1
    failed_results.when = pre_task_failed
    running_job_exe.task_fail(failed_results, error)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Failure details should be persisted on the job execution model
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, job_exe.pre_started)
    self.assertEqual(pre_task_failed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(error.id, job_exe.error_id)
    self.assertGreater(job_exe.ended, pre_task_failed)
def test_canceled_job_execution(self):
    """Tests running through a job execution that gets canceled"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Run the pre-task from launch through successful completion
    first_task = running_job_exe.start_next_task()
    start_time = now()
    running_job_exe.task_start(first_task.id, start_time)
    results = TaskResults(first_task.id)
    results.exit_code = 0
    results.when = start_time + timedelta(seconds=1)
    running_job_exe.task_complete(results)

    # Cancel while the job-task is outstanding
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()

    # The outstanding job-task is the one canceled, and the execution ends
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_pre_task_launch_error(self):
    """Tests running through a job execution where a pre-task fails to launch"""

    # Reset the built-in error cache so the lookup below hits the database
    CACHED_BUILTIN_ERRORS.clear()
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task fails right after launch, without ever reporting as running
    failed_task_id = running_job_exe.start_next_task().id
    failed_results = TaskResults(failed_task_id)
    failed_results.when = now()
    running_job_exe.task_fail(failed_results)

    # The execution should be failed with the Docker task launch error
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'docker-task-launch')
def statusUpdate(self, driver, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.) Note that returning from this callback acknowledges
    receipt of this status update. If for whatever reason the scheduler aborts during this callback (or the process
    exits) another status update will be delivered. Note, however, that this is currently not true if the slave
    sending the status update is lost or fails during that time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.

    :param driver: the Mesos scheduler driver delivering the update
    :param status: the Mesos TaskStatus protobuf for the task
    """

    # Timing is tracked so slow handling can be logged at the end
    started = now()

    task_id = status.task_id.value
    job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
    logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

    # Since we have a status update for this task, remove it from reconciliation set
    self._recon_thread.remove_task_id(task_id)

    try:
        running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

        if running_job_exe:
            results = TaskResults(task_id)
            results.exit_code = utils.parse_exit_code(status)
            results.when = utils.get_status_timestamp(status)
            # For terminal states, pull stdout/stderr from the slave before
            # applying the update; log pulling is best-effort and must never
            # block the state transition below.
            if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                mesos_pb2.TASK_KILLED]:
                try:
                    log_start_time = now()
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                    results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                    log_end_time = now()
                    logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                except Exception:
                    # Best-effort only: failure to fetch logs is logged and swallowed
                    logger.exception('Error pulling logs for task %s', task_id)

            # Apply status update to running job execution
            if status.state == mesos_pb2.TASK_RUNNING:
                # Record live stdout/stderr URLs so the task logs can be followed
                hostname = running_job_exe._node_hostname
                port = running_job_exe._node_port
                task_dir = get_slave_task_directory(hostname, port, task_id)
                stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
            elif status.state == mesos_pb2.TASK_FINISHED:
                running_job_exe.task_complete(results)
            elif status.state == mesos_pb2.TASK_LOST:
                running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
            elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                # No explicit error: task_fail determines the error from the results
                running_job_exe.task_fail(results)

            # Remove finished job execution
            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(job_exe_id)
        else:
            # Scheduler doesn't have any knowledge of this job execution
            Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
    except Exception:
        logger.exception('Error handling status update for job execution: %s', job_exe_id)
        # Error handling status update, add task so it can be reconciled
        self._recon_thread.add_task_ids([task_id])

    # Warn when handling took longer than the configured threshold
    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def test_successful_normal_job_execution(self):
    """Tests running through a normal job execution successfully"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the pre-task
    pre_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start it well in the past so now() called at completion is in the future
    pre_started = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_task_id, pre_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Simulate the pre-task updating the command arguments in the database
    updated_commands_args = '-arg updated'
    JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

    # Complete the pre-task
    pre_completed = pre_started + timedelta(seconds=1)
    pre_results = TaskResults(pre_task_id)
    pre_results.exit_code = 1
    pre_results.when = pre_completed
    running_job_exe.task_complete(pre_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the job-task; it must have picked up the updated command arguments
    job_task = running_job_exe.start_next_task()
    job_task_id = job_task.id
    self.assertEqual(job_task._command_arguments, updated_commands_args)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start the job-task
    job_started = pre_completed + timedelta(seconds=1)
    running_job_exe.task_start(job_task_id, job_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete the job-task
    job_completed = job_started + timedelta(seconds=1)
    job_results = TaskResults(job_task_id)
    job_results.exit_code = 2
    job_results.when = job_completed
    running_job_exe.task_complete(job_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the post-task
    post_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start the post-task
    post_started = job_completed + timedelta(seconds=1)
    running_job_exe.task_start(post_task_id, post_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete the post-task; the whole execution is now done
    post_completed = post_started + timedelta(seconds=1)
    post_results = TaskResults(post_task_id)
    post_results.exit_code = 3
    post_results.when = post_completed
    running_job_exe.task_complete(post_results)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Every task's timestamps and exit code should be persisted
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_started, job_exe.pre_started)
    self.assertEqual(pre_completed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual(job_started, job_exe.job_started)
    self.assertEqual(job_completed, job_exe.job_completed)
    self.assertEqual(2, job_exe.job_exit_code)
    self.assertEqual(post_started, job_exe.post_started)
    self.assertEqual(post_completed, job_exe.post_completed)
    self.assertEqual(3, job_exe.post_exit_code)
    self.assertEqual('COMPLETED', job_exe.status)
    self.assertGreater(job_exe.ended, post_completed)
def test_successful_normal_job_execution(self):
    """Tests running through a normal job execution successfully"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the pre-task
    pre_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start it well in the past so now() called at completion is in the future
    pre_start = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_id, pre_start)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Simulate the pre-task rewriting the command arguments in the database
    updated_commands_args = '-arg updated'
    JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

    # Complete the pre-task
    pre_end = pre_start + timedelta(seconds=1)
    pre_results = TaskResults(pre_id)
    pre_results.exit_code = 1
    pre_results.when = pre_end
    running_job_exe.task_complete(pre_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the job-task; it must have picked up the updated command arguments
    job_task = running_job_exe.start_next_task()
    job_id = job_task.id
    self.assertEqual(job_task._command_arguments, updated_commands_args)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start the job-task
    job_start = pre_end + timedelta(seconds=1)
    running_job_exe.task_start(job_id, job_start)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete the job-task
    job_end = job_start + timedelta(seconds=1)
    job_results = TaskResults(job_id)
    job_results.exit_code = 2
    job_results.when = job_end
    running_job_exe.task_complete(job_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Launch the post-task
    post_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Start the post-task
    post_start = job_end + timedelta(seconds=1)
    running_job_exe.task_start(post_id, post_start)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete the post-task; the whole execution is now done
    post_end = post_start + timedelta(seconds=1)
    post_results = TaskResults(post_id)
    post_results.exit_code = 3
    post_results.when = post_end
    running_job_exe.task_complete(post_results)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Every task's timestamps and exit code should be persisted
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_start, job_exe.pre_started)
    self.assertEqual(pre_end, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual(job_start, job_exe.job_started)
    self.assertEqual(job_end, job_exe.job_completed)
    self.assertEqual(2, job_exe.job_exit_code)
    self.assertEqual(post_start, job_exe.post_started)
    self.assertEqual(post_end, job_exe.post_completed)
    self.assertEqual(3, job_exe.post_exit_code)
    self.assertEqual('COMPLETED', job_exe.status)
    self.assertGreater(job_exe.ended, post_end)
def statusUpdate(self, driver, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.) Note that returning from this callback acknowledges
    receipt of this status update. If for whatever reason the scheduler aborts during this callback (or the process
    exits) another status update will be delivered. Note, however, that this is currently not true if the slave
    sending the status update is lost or fails during that time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.

    :param driver: the Mesos scheduler driver delivering the update
    :param status: the Mesos TaskStatus protobuf for the task
    """

    # Timing is tracked so slow handling can be logged at the end
    started = now()

    task_id = status.task_id.value
    job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
    logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

    # Since we have a status update for this task, remove it from reconciliation set
    self._recon_thread.remove_task_id(task_id)

    try:
        running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

        if running_job_exe:
            results = TaskResults(task_id)
            results.exit_code = utils.parse_exit_code(status)
            results.when = utils.get_status_timestamp(status)
            # For terminal states, pull stdout/stderr from the slave before
            # applying the update; log pulling is best-effort and must never
            # block the state transition below.
            if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                mesos_pb2.TASK_KILLED]:
                try:
                    log_start_time = now()
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                    results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                    log_end_time = now()
                    logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                except Exception:
                    # Best-effort only: failure to fetch logs is logged and swallowed
                    logger.exception('Error pulling logs for task %s', task_id)

            # Apply status update to running job execution
            if status.state == mesos_pb2.TASK_RUNNING:
                # Record live stdout/stderr URLs so the task logs can be followed
                hostname = running_job_exe._node_hostname
                port = running_job_exe._node_port
                task_dir = get_slave_task_directory(hostname, port, task_id)
                stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
            elif status.state == mesos_pb2.TASK_FINISHED:
                running_job_exe.task_complete(results)
            elif status.state == mesos_pb2.TASK_LOST:
                running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
            elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                # No explicit error: task_fail determines the error from the results
                running_job_exe.task_fail(results)

            # Remove finished job execution
            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(job_exe_id)
        else:
            # Scheduler doesn't have any knowledge of this job execution
            Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
    except Exception:
        logger.exception('Error handling status update for job execution: %s', job_exe_id)
        # Error handling status update, add task so it can be reconciled
        self._recon_thread.add_task_ids([task_id])

    # Warn when handling took longer than the configured threshold
    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())