Beispiel #1
0
    def _reconcile_running_jobs(self):
        '''Looks up the current task of every running job execution and adds those task IDs to the
        reconciliation set. Running job executions that the scheduler has no record of are failed
        with the scheduler error instead.
        '''

        # Query for job executions that are in RUNNING status
        running_job_exes = JobExecution.objects.get_running_job_exes()

        # Look through scheduler data and find the current task ID for each RUNNING job execution
        task_ids = []
        for job_exe in running_job_exes:
            scale_job_exe = self._get_job_exe(job_exe.id)
            if not scale_job_exe:
                # Fail any jobs that the scheduler doesn't know about
                error = get_scheduler_error()
                Queue.objects.handle_job_failure(job_exe.id, now(), error)
                continue

            task_id = scale_job_exe.current_task()
            if task_id:
                task_ids.append(task_id)

        # Add the collected task IDs to the set to be reconciled. The lock is held via a
        # with-statement so that release only happens when the acquire actually succeeded.
        with self.recon_lock:
            self.recon_set.update(task_ids)
Beispiel #2
0
    def statusUpdate(self, driver, status):
        '''
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.

        :param driver: The scheduler driver passed to the callback (not used by this method)
        :param status: The task status update; its ``state`` and ``task_id.value`` fields are read
        '''

        if self.debug:
            connect_remote_debug()

        status_str = utils.status_to_string(status.state)
        task_id = status.task_id.value
        job_exe_id = ScaleJobExecution.get_job_exe_id(task_id)
        logger.info('Status update for task %s: %s', task_id, status_str)

        # Got a status update, so remove task from reconciliation set (no-op if it is not present)
        with self.recon_lock:
            self.recon_set.discard(task_id)

        try:
            scale_job_exe = self._get_job_exe(job_exe_id)
            if not scale_job_exe:
                # Scheduler doesn't have any knowledge of this job execution
                error = get_scheduler_error()
                Queue.objects.handle_job_failure(job_exe_id, now(), error)
                return

            if status.state == mesos_pb2.TASK_RUNNING:
                scale_job_exe.task_running(task_id, status)
            elif status.state == mesos_pb2.TASK_FINISHED:
                scale_job_exe.task_completed(task_id, status)
            elif status.state in [mesos_pb2.TASK_LOST, mesos_pb2.TASK_ERROR,
                                  mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                # The task had an error so job execution is failed
                scale_job_exe.task_failed(task_id, status)
            if scale_job_exe.is_finished():
                # No more tasks so job execution is completed
                self._delete_job_exe(scale_job_exe)
        except Exception:
            # Only ordinary errors are handled here; SystemExit and KeyboardInterrupt are allowed
            # to propagate instead of being silently swallowed
            logger.exception('Error handling status update for job execution: %s', job_exe_id)
            # Error handling status update, add task so it can be reconciled
            with self.recon_lock:
                self.recon_set.add(task_id)