Example #1
    def task_completed(self, task_id, status):
        '''Indicates that a Mesos task for this job execution has completed

        :param task_id: The ID of the task that was completed
        :type task_id: str
        :param status: The task status
        :type status: :class:`mesos_pb2.TaskStatus`
        '''

        if self.current_task_id != task_id:
            return

        when_completed = EPOCH + timedelta(seconds=status.timestamp)
        exit_code = self._parse_exit_code(status)

        stdout = None
        stderr = None
        # Initialize here so a log retrieval failure below does not leave mesos_run_id undefined
        mesos_run_id = None
        try:
            node = self._cached_node
            task_dir = get_slave_task_directory(node.hostname, node.port, self.current_task_id)
            mesos_run_id = get_slave_task_run_id(node.hostname, node.port, self.current_task_id)

            stdout = get_slave_task_file(node.hostname, node.port, task_dir, 'stdout')
            stderr = get_slave_task_file(node.hostname, node.port, task_dir, 'stderr')
        except Exception:
            logger.exception('Error getting stdout/stderr for %s', self.current_task_id)

        if self._is_current_task_pre():
            JobExecution.objects.pre_steps_completed(self.job_exe_id, when_completed, exit_code, stdout, stderr)
        elif self._is_current_task_job():
            JobExecution.objects.job_completed(self.job_exe_id, when_completed, exit_code, stdout, stderr, mesos_run_id)
        elif self._is_current_task_post():
            JobExecution.objects.post_steps_completed(self.job_exe_id, when_completed, exit_code, stdout, stderr)

        JobExecution.objects.set_log_urls(self.job_exe_id, None, None)

        # Only successfully completed if there are no more tasks and we never failed along the way
        if not self.remaining_task_ids and not self.failed:
            Queue.objects.handle_job_completion(self.job_exe_id, when_completed)
        self.current_task_id = None
        self.current_task_stdout_url = None
        self.current_task_stderr_url = None
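
For context, here is a minimal, hypothetical sketch of how a scheduler callback might hand a finished task's status to task_completed(). The task ID value and the job_exe object are placeholders; only the mesos_pb2.TaskStatus fields actually read by the method above (state and timestamp) are populated.

    from mesos.interface import mesos_pb2  # import path may differ by Mesos bindings version

    status = mesos_pb2.TaskStatus()
    status.task_id.value = 'scale_1234_job'   # hypothetical task ID
    status.state = mesos_pb2.TASK_FINISHED
    status.timestamp = 1400000000.0           # seconds since the Unix epoch, as used above

    # job_exe is a placeholder for the job execution object these methods belong to
    job_exe.task_completed(status.task_id.value, status)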
Example #2
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        task_id = status.task_id.value
        job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

        # Since we have a status update for this task, remove it from reconciliation set
        self._recon_thread.remove_task_id(task_id)

        try:
            running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

            if running_job_exe:
                results = TaskResults(task_id)
                results.exit_code = utils.parse_exit_code(status)
                results.when = utils.get_status_timestamp(status)
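                # Pull stdout/stderr contents only for tasks that have reached a terminal state;
                # for TASK_RUNNING below, only the log URLs are needed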
                if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                    mesos_pb2.TASK_KILLED]:
                    try:
                        log_start_time = now()
                        hostname = running_job_exe._node_hostname
                        port = running_job_exe._node_port
                        task_dir = get_slave_task_directory(hostname, port, task_id)
                        results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                        results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                        log_end_time = now()
                        logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                    except Exception:
                        logger.exception('Error pulling logs for task %s', task_id)

                # Apply status update to running job execution
                if status.state == mesos_pb2.TASK_RUNNING:
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                    stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                    running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
                elif status.state == mesos_pb2.TASK_FINISHED:
                    running_job_exe.task_complete(results)
                elif status.state == mesos_pb2.TASK_LOST:
                    running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
                elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                    running_job_exe.task_fail(results)

                # Remove finished job execution
                if running_job_exe.is_finished():
                    self._job_exe_manager.remove_job_exe(job_exe_id)
            else:
                # Scheduler doesn't have any knowledge of this job execution
                Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
        except Exception:
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            self._recon_thread.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
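
The TaskResults container referenced above is defined elsewhere in the scheduler package. A minimal sketch consistent with how it is used in statusUpdate() (construction with a task ID, then assignment of exit_code, when, stdout and stderr) might look like the following; the field defaults are assumptions, not the real implementation.

    class TaskResults(object):
        '''Simple container for the outcome of a single Mesos task (sketch only)'''

        def __init__(self, task_id):
            self.task_id = task_id
            self.exit_code = None  # populated from utils.parse_exit_code(status)
            self.when = None       # populated from utils.get_status_timestamp(status)
            self.stdout = None     # raw stdout contents, when they can be pulled
            self.stderr = None     # raw stderr contents, when they can be pulled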
Example #3
    def task_failed(self, task_id, status):
        '''Indicates that a Mesos task for this job execution has failed

        :param task_id: The ID of the task that failed
        :type task_id: str
        :param status: The task status
        :type status: :class:`mesos_pb2.TaskStatus`
        '''

        if self.current_task_id != task_id:
            return

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self.job_exe_id)

        stdout = None
        stderr = None
        node = None
        if status.state != mesos_pb2.TASK_LOST:
            try:
                node = self._cached_node
                task_dir = get_slave_task_directory(node.hostname, node.port, self.current_task_id)
                stdout = get_slave_task_file(node.hostname, node.port, task_dir, 'stdout')
                stderr = get_slave_task_file(node.hostname, node.port, task_dir, 'stderr')
            except Exception:
                logger.exception('Error getting stdout/stderr for %s', self.current_task_id)

        self.failed = True
        error = None
        if status.state == mesos_pb2.TASK_LOST:
            error = get_mesos_error()
        elif status.state == mesos_pb2.TASK_KILLED and self.timed_out:
            error = get_timeout_error()
        when_failed = EPOCH + timedelta(seconds=status.timestamp)

        exit_code = self._parse_exit_code(status)
        if self._is_current_task_pre():
            # Check scale_pre_steps command to see if exit code maps to a specific error
            if exit_code in PRE_EXIT_CODE_DICT:
                error = PRE_EXIT_CODE_DICT[exit_code]()
            JobExecution.objects.pre_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
        elif self._is_current_task_job():
            # Do error mapping here to determine error
            error = job_exe.get_error_interface().get_error(exit_code)
            JobExecution.objects.job_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)
        elif self._is_current_task_post():
            # Check scale_post_steps command to see if exit code maps to a specific error
            if exit_code in POST_EXIT_CODE_DICT:
                error = POST_EXIT_CODE_DICT[exit_code]()
            JobExecution.objects.post_steps_failed(self.job_exe_id, when_failed, exit_code, stdout, stderr)

        if not error:
            error = Error.objects.get_unknown_error()
        Queue.objects.handle_job_failure(self.job_exe_id, when_failed, error)

        # Check for a high number of system errors and decide if we should pause the node
        if (error.category == 'SYSTEM' and job_exe.job.num_exes >= job_exe.job.max_tries and
                node is not None and not node.is_paused):
            # Count recent system failures on this node within the configurable error period and
            # pause the node if the configurable maximum number of errors has been reached
            node_error_period = Scheduler.objects.first().node_error_period
            if node_error_period > 0:
                check_time = datetime.utcnow() - timedelta(minutes=node_error_period)
                # find out how many jobs have recently failed on this node with a system error
                num_node_errors = JobExecution.objects.select_related('error', 'node').filter(
                    status='FAILED', error__category='SYSTEM', ended__gte=check_time, node=node).distinct('job').count()
                max_node_errors = Scheduler.objects.first().max_node_errors
                if num_node_errors >= max_node_errors:
                    logger.warning('%s failed %d jobs in %d minutes, pausing the host',
                                   node.hostname, num_node_errors, node_error_period)
                    with transaction.atomic():
                        node.is_paused = True
                        node.is_paused_errors = True
                        node.pause_reason = "System Failure Rate Too High"
                        node.save()

        # Remove all remaining tasks
        self.remaining_task_ids = []

        self.current_task_id = None
        self.current_task_stdout_url = None
        self.current_task_stderr_url = None
        JobExecution.objects.set_log_urls(self.job_exe_id, None, None)
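
PRE_EXIT_CODE_DICT and POST_EXIT_CODE_DICT map known exit codes from the scale_pre_steps and scale_post_steps commands to callables that return the corresponding error. A sketch of that shape is shown below; the exit codes and error names are illustrative assumptions, and only the dictionary-of-callables lookup pattern is taken from task_failed() above.

    # Hypothetical sketch: the exit codes and error names are placeholders, not the real mapping.
    # 'Error' is the model whose manager provides get_builtin_error(), as used in the examples above.
    PRE_EXIT_CODE_DICT = {
        10: lambda: Error.objects.get_builtin_error('database'),         # assumed code/name
        20: lambda: Error.objects.get_builtin_error('missing-setting'),  # assumed code/name
    }

    POST_EXIT_CODE_DICT = {
        10: lambda: Error.objects.get_builtin_error('database'),         # assumed code/name
    }

    # Lookup mirrors the pattern in task_failed()
    exit_code = 10
    if exit_code in PRE_EXIT_CODE_DICT:
        error = PRE_EXIT_CODE_DICT[exit_code]()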