Ejemplo n.º 1
0
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        task_update.job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
Ejemplo n.º 2
0
    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None
Ejemplo n.º 3
0
    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task.id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    try:
                        job_exe.execution_timed_out(task, when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing timed out job execution %i',
                            job_exe_id)
Ejemplo n.º 4
0
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        task_update.job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)

    return task_update
Ejemplo n.º 5
0
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status))
        task_id = task_update.task_id

        if mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        task_update_mgr.add_task_update(model)

        if task_id.startswith(CLEANUP_TASK_ID_PREFIX):
            cleanup_mgr.handle_task_update(task_update)
        else:
            job_exe_id = JobExecution.get_job_exe_id(task_id)

            try:
                running_job_exe = running_job_mgr.get_job_exe(job_exe_id)

                if running_job_exe:
                    running_job_exe.task_update(task_update)

                    # Remove finished job execution
                    if running_job_exe.is_finished():
                        running_job_mgr.remove_job_exe(job_exe_id)
                        cleanup_mgr.add_job_execution(running_job_exe)
                else:
                    # Scheduler doesn't have any knowledge of this job execution
                    Queue.objects.handle_job_failure(job_exe_id, now(), [],
                                                     Error.objects.get_builtin_error('scheduler-lost'))
            except Exception:
                logger.exception('Error handling status update for job execution: %s', job_exe_id)
                # Error handling status update, add task so it can be reconciled
                recon_mgr.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())