def handle_task_update(self, task_update):
    """Handles the given task update and returns the associated job execution if it has finished

    :param task_update: The task update
    :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
    :returns: The job execution if it has finished, None otherwise
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    finished_job_exe = None
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
        with self._lock:
            if cluster_id in self._running_job_exes:
                job_exe = self._running_job_exes[cluster_id]
                job_exe.task_update(task_update)
                if job_exe.is_finished():
                    self._handle_finished_job_exe(job_exe)
                    finished_job_exe = job_exe

    # TODO: this can be removed once database operations move to messaging backend
    if finished_job_exe:
        self._handle_finished_job_exe_in_database(finished_job_exe)
        return finished_job_exe

    return None
def handle_task_timeout(self, task, when):
    """Handles the timeout of the given task

    :param task: The task
    :type task: :class:`job.tasks.base_task.Task`
    :param when: The time that the timeout occurred
    :type when: :class:`datetime.datetime`
    """

    if task.id.startswith(JOB_TASK_ID_PREFIX):
        cluster_id = JobExecution.parse_cluster_id(task.id)
        with self._lock:
            if cluster_id in self._running_job_exes:
                job_exe = self._running_job_exes[cluster_id]
                # We do not remove the failed job execution at this point. We wait for the status update of the
                # killed task to come back so that job execution cleanup occurs after the task is dead.
                job_exe.execution_timed_out(task, when)
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        task_update.job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
def handle_task_update(self, task_update):
    """Handles the given task update and returns the associated job execution if it has finished

    :param task_update: The task update
    :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
    :returns: The job execution if it has finished, None otherwise
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
        with self._lock:
            if cluster_id in self._running_job_exes:
                job_exe = self._running_job_exes[cluster_id]
                job_exe.task_update(task_update)
                if job_exe.is_finished():
                    self._handle_finished_job_exe(job_exe)
                    return job_exe

    return None
def handle_task_timeout(self, task, when):
    """Handles the timeout of the given task

    :param task: The task
    :type task: :class:`job.tasks.base_task.Task`
    :param when: The time that the timeout occurred
    :type when: :class:`datetime.datetime`
    """

    if task.id.startswith(JOB_TASK_ID_PREFIX):
        job_exe_id = JobExecution.get_job_exe_id(task.id)
        with self._lock:
            if job_exe_id in self._running_job_exes:
                job_exe = self._running_job_exes[job_exe_id]
                try:
                    job_exe.execution_timed_out(task, when)
                except DatabaseError:
                    logger.exception('Error failing timed out job execution %i', job_exe_id)
def create_job_exe(job_type=None, job=None, exe_num=None, node=None, timeout=None, input_file_size=10.0,
                   queued=None, started=None, status='RUNNING', error=None, ended=None, output=None,
                   task_results=None):
    """Creates a job_exe model for unit testing, and may also create job_exe_end and job_exe_output models
    depending on status

    :returns: The job_exe model
    :rtype: :class:`job.models.JobExecution`
    """

    when = timezone.now()
    if not job:
        job = create_job(job_type=job_type, status=status, input_file_size=input_file_size)
    job_type = job.job_type

    job_exe = JobExecution()
    job_exe.job = job
    job_exe.job_type = job_type
    if not exe_num:
        exe_num = job.num_exes
    job_exe.exe_num = exe_num
    job_exe.set_cluster_id('1234', job.id, job_exe.exe_num)
    if not node:
        node = node_utils.create_node()
    job_exe.node = node
    if not timeout:
        timeout = job.timeout
    job_exe.timeout = timeout
    job_exe.input_file_size = input_file_size
    job_exe.resources = job.get_resources().get_json().get_dict()
    job_exe.configuration = ExecutionConfiguration().get_dict()
    if not queued:
        queued = when
    job_exe.queued = queued
    if not started:
        started = when + datetime.timedelta(seconds=1)
    job_exe.started = started
    job_exe.save()

    if status in ['COMPLETED', 'FAILED', 'CANCELED']:
        job_exe_end = JobExecutionEnd()
        job_exe_end.job_exe_id = job_exe.id
        job_exe_end.job = job_exe.job
        job_exe_end.job_type = job_exe.job_type
        job_exe_end.exe_num = job_exe.exe_num
        if not task_results:
            task_results = TaskResults()
        job_exe_end.task_results = task_results.get_dict()
        job_exe_end.status = status
        if status == 'FAILED' and not error:
            error = error_test_utils.create_error()
        job_exe_end.error = error
        job_exe_end.node = node
        job_exe_end.queued = queued
        job_exe_end.started = started
        job_exe_end.seed_started = task_results.get_task_started('main')
        job_exe_end.seed_ended = task_results.get_task_ended('main')
        if not ended:
            ended = started + datetime.timedelta(seconds=1)
        job_exe_end.ended = ended
        job_exe_end.save()

    if status == 'COMPLETED' or output:
        job_exe_output = JobExecutionOutput()
        job_exe_output.job_exe_id = job_exe.id
        job_exe_output.job = job_exe.job
        job_exe_output.job_type = job_exe.job_type
        job_exe_output.exe_num = job_exe.exe_num
        if not output:
            output = JobResults()
        job_exe_output.output = output.get_dict()
        job_exe_output.save()

    return job_exe
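# A minimal usage sketch for the helper above, not part of the original source. It assumes a Django
# test case with the Scale test database available and the usual test utility imports; the test name
# and assertions are illustrative only and follow directly from the helper's defaults.
def test_create_completed_job_exe_sketch(self):
    '''Sketch: a COMPLETED execution is saved and takes its exe_num from the job by default.'''

    job_exe = create_job_exe(status='COMPLETED')
    self.assertIsNotNone(job_exe.id)
    self.assertEqual(job_exe.exe_num, job_exe.job.num_exes)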
def test_append_stderr_none(self):
    '''Tests skipping append when no error is provided.'''

    job_exe = JobExecution(stderr='initial')
    job_exe.append_stderr(None)
    self.assertEqual(job_exe.stderr, 'initial')
def test_append_stderr_join(self):
    '''Tests appending error to existing error.'''

    job_exe = JobExecution(stderr='initial')
    job_exe.append_stderr('-test1')
    self.assertEqual(job_exe.stderr, 'initial-test1')
def test_append_stderr_initial(self):
    '''Tests appending error for the first time.'''

    job_exe = JobExecution()
    job_exe.append_stderr(stderr='initial')
    self.assertEqual(job_exe.stderr, 'initial')
def test_append_stdout_none(self):
    '''Tests skipping append when no output is provided.'''

    job_exe = JobExecution(stdout='initial')
    job_exe.append_stdout(None)
    self.assertEqual(job_exe.stdout, 'initial')
def create_job_exe_model(self, framework_id, when):
    """Creates and returns a scheduled job execution model

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param when: The start time
    :type when: :class:`datetime.datetime`
    :returns: The job execution model
    :rtype: :class:`job.models.JobExecution`
    """

    job_exe = JobExecution()
    job_exe.job_id = self._queue.job_id
    job_exe.job_type_id = self._queue.job_type_id
    job_exe.recipe_id = self._queue.recipe_id
    job_exe.batch_id = self._queue.batch_id
    job_exe.exe_num = self._queue.exe_num
    job_exe.timeout = self._queue.timeout
    job_exe.docker_image = self._queue.docker_image
    job_exe.input_file_size = self._queue.input_file_size
    job_exe.configuration = self.configuration.get_dict()
    job_exe.queued = self._queue.queued

    if self.is_canceled:
        job_exe.node_id = None
        job_exe.resources = NodeResources().get_json().get_dict()
        job_exe.started = None
    else:
        job_exe.node_id = self._scheduled_node_id
        job_exe.resources = self._scheduled_resources.get_json().get_dict()
        job_exe.started = when

    job_exe.set_cluster_id(framework_id, self._queue.job_id, self._queue.exe_num)

    if self.required_resources.gpus > 0:
        if not GPUManager.assign_gpus_for_job(job_exe.node_id, job_exe.job_id, self.required_resources.gpus):
            logger.error('Job %s was unable to assign %s reserved GPUs on node %s; this should not be possible.',
                         job_exe.job_id, self.required_resources.gpus, job_exe.node_id)

    return job_exe
def test_append_stdout_initial(self):
    '''Tests appending output for the first time.'''

    job_exe = JobExecution()
    job_exe.append_stdout(stdout='initial')
    self.assertEqual(job_exe.stdout, 'initial')
def update(self, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.)

    Note that returning from this callback acknowledges receipt of this status update. If for whatever reason the
    scheduler aborts during this callback (or the process exits) another status update will be delivered. Note,
    however, that this is currently not true if the slave sending the status update is lost or fails during that
    time.
    """

    started = now()

    model = utils.create_task_update_model(status)
    mesos_status = model.status
    task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
    task_id = task_update.task_id
    was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
    was_job_finished = False

    if mesos_status == 'TASK_ERROR':
        logger.error('Status update for task %s: %s', task_id, mesos_status)
    elif mesos_status == 'TASK_LOST':
        logger.warning('Status update for task %s: %s', task_id, mesos_status)
    else:
        logger.info('Status update for task %s: %s', task_id, mesos_status)

    # Since we have a status update for this task, remove it from reconciliation set
    recon_mgr.remove_task_id(task_id)

    # Hand off task update to be saved in the database
    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Grab job execution ID from manager
        cluster_id = JobExecution.parse_cluster_id(task_id)
        job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
        if job_exe:
            model.job_exe_id = job_exe.id
    task_update_mgr.add_task_update(model)

    # Update task with latest status
    # This should happen before the job execution or node manager are updated, since they will assume that the
    # task has already been updated
    task_mgr.handle_task_update(task_update)

    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Job task, so update the job execution
        try:
            job_exe = job_exe_mgr.handle_task_update(task_update)
            if job_exe and job_exe.is_finished():
                logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                was_job_finished = True
                cleanup_mgr.add_job_execution(job_exe)
                GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)
        except Exception:
            cluster_id = JobExecution.parse_cluster_id(task_id)
            logger.exception('Error handling status update for job execution: %s', cluster_id)
            # Error handling status update, add task so it can be reconciled
            task = task_mgr.get_task(task_id)
            if task:
                recon_mgr.add_tasks([task])
    else:
        # Not a job task, so must be either a node or system task
        node_mgr.handle_task_update(task_update)
        system_task_mgr.handle_task_update(task_update)

    scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def statusUpdate(self, driver, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.)

    Note that returning from this callback acknowledges receipt of this status update. If for whatever reason the
    scheduler aborts during this callback (or the process exits) another status update will be delivered. Note,
    however, that this is currently not true if the slave sending the status update is lost or fails during that
    time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
    """

    started = now()

    model = utils.create_task_update_model(status)
    mesos_status = model.status
    task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status))
    task_id = task_update.task_id

    if mesos_status == 'TASK_LOST':
        logger.warning('Status update for task %s: %s', task_id, mesos_status)
    else:
        logger.info('Status update for task %s: %s', task_id, mesos_status)

    # Since we have a status update for this task, remove it from reconciliation set
    recon_mgr.remove_task_id(task_id)

    # Hand off task update to be saved in the database
    task_update_mgr.add_task_update(model)

    if task_id.startswith(CLEANUP_TASK_ID_PREFIX):
        cleanup_mgr.handle_task_update(task_update)
    else:
        job_exe_id = JobExecution.get_job_exe_id(task_id)
        try:
            running_job_exe = running_job_mgr.get_job_exe(job_exe_id)
            if running_job_exe:
                running_job_exe.task_update(task_update)
                # Remove finished job execution
                if running_job_exe.is_finished():
                    running_job_mgr.remove_job_exe(job_exe_id)
                    cleanup_mgr.add_job_execution(running_job_exe)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(job_exe_id, now(), [],
                                                 Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            recon_mgr.add_task_ids([task_id])

    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def create_running_job_exe(agent_id='agent_1', job_type=None, job=None, node=None, timeout=None,
                           input_file_size=10.0, queued=None, started=None, resources=None, priority=None,
                           num_exes=1):
    """Creates a running job execution for unit testing

    :returns: The running job execution
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    when = timezone.now()
    if not job:
        job = create_job(job_type=job_type, status='RUNNING', input_file_size=input_file_size, num_exes=num_exes)
    job_type = job.job_type

    # Configuration that occurs at queue time
    input_files = {}
    input_file_ids = job.get_job_data().get_input_file_ids()
    if input_file_ids:
        for input_file in ScaleFile.objects.get_files_for_queued_jobs(input_file_ids):
            input_files[input_file.id] = input_file
    exe_config = QueuedExecutionConfigurator(input_files).configure_queued_job(job)

    job_exe = JobExecution()
    job_exe.set_cluster_id('1234', job.id, job.num_exes)
    job_exe.job = job
    job_exe.job_type = job_type
    job_exe.exe_num = job.num_exes
    if not node:
        node = node_utils.create_node()
    job_exe.node = node
    if not timeout:
        timeout = job.timeout
    job_exe.timeout = timeout
    job_exe.input_file_size = input_file_size
    if not resources:
        resources = job.get_resources()
    job_exe.resources = resources.get_json().get_dict()
    job_exe.configuration = exe_config.get_dict()
    if not queued:
        queued = when
    job_exe.queued = queued
    if not started:
        started = when + datetime.timedelta(seconds=1)
    job_exe.started = started
    job_exe.save()

    if not priority:
        priority = job.priority

    # Configuration that occurs at schedule time
    workspaces = {}
    for workspace in Workspace.objects.all():
        workspaces[workspace.name] = workspace
    secret_config = ScheduledExecutionConfigurator(workspaces).configure_scheduled_job(
        job_exe, job_type, job_type.get_job_interface(), 'INFO')

    return RunningJobExecution(agent_id, job_exe, job_type, secret_config, priority)
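# A minimal usage sketch for the helper above, not part of the original source. It assumes a Django
# test case and that RunningJobExecution.is_finished() (the check used by the managers earlier in
# this section) reports False for an execution that has not yet received a terminal task update.
def test_create_running_job_exe_sketch(self):
    '''Sketch: a freshly created running execution has not finished yet.'''

    running_job_exe = create_running_job_exe(agent_id='agent_1')
    self.assertFalse(running_job_exe.is_finished())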
def create_job_exe_model(self, framework_id, when):
    """Creates and returns a scheduled job execution model

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param when: The start time
    :type when: :class:`datetime.datetime`
    :returns: The job execution model
    :rtype: :class:`job.models.JobExecution`
    """

    job_exe = JobExecution()
    job_exe.job_id = self._queue.job_id
    job_exe.job_type_id = self._queue.job_type_id
    job_exe.recipe_id = self._queue.recipe_id
    job_exe.batch_id = self._queue.batch_id
    job_exe.exe_num = self._queue.exe_num
    job_exe.timeout = self._queue.timeout
    job_exe.input_file_size = self._queue.input_file_size
    job_exe.configuration = self.configuration.get_dict()
    job_exe.queued = self._queue.queued

    if self.is_canceled:
        job_exe.node_id = None
        job_exe.resources = NodeResources().get_json().get_dict()
        job_exe.started = None
    else:
        job_exe.node_id = self._scheduled_node_id
        job_exe.resources = self._scheduled_resources.get_json().get_dict()
        job_exe.started = when

    job_exe.set_cluster_id(framework_id, self._queue.job_id, self._queue.exe_num)

    return job_exe
def test_append_stdout_join(self):
    '''Tests appending output to existing output.'''

    job_exe = JobExecution(stdout='initial')
    job_exe.append_stdout('-test1')
    self.assertEqual(job_exe.stdout, 'initial-test1')