Example #1
0
    def _send_tasks_for_reconciliation(self):
        """Gathers the IDs of tasks that need reconciliation and hands them to the reconciliation manager

        The IDs are requested from the cleanup manager and from the running job manager using the same timestamp.
        """

        current_time = now()

        # Start from the cleanup manager's IDs, then append the running job manager's IDs onto that same list
        ids_to_reconcile = cleanup_mgr.get_task_ids_for_reconciliation(current_time)
        running_job_ids = running_job_mgr.get_task_ids_for_reconciliation(current_time)
        ids_to_reconcile.extend(running_job_ids)

        recon_mgr.add_task_ids(ids_to_reconcile)
Example #2
0
    def _reconcile_tasks(self, when):
        """Passes the IDs of all tasks that need reconciliation to the reconciliation manager

        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        # Only the task IDs are forwarded, not the task objects themselves
        ids_to_reconcile = [task.id for task in task_mgr.get_tasks_to_reconcile(when)]
        recon_mgr.add_task_ids(ids_to_reconcile)
Example #3
0
    def slaveLost(self, driver, slaveId):
        """
        Callback invoked once a slave has been determined unreachable (for
        example a machine failure or a network partition). Frameworks will
        usually need to reschedule, on a new slave, any tasks that had been
        launched on the lost one.

        This implementation reports the loss to the node and offer managers,
        fails the job executions that were running on the lost node, and logs
        how long the callback took. The ``driver`` argument is not used here.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
        """

        start_time = now()

        lost_agent_id = slaveId.value
        lost_node = node_mgr.get_node(lost_agent_id)

        # Prefer the hostname in the log when the node is known, otherwise fall back to the agent ID
        if not lost_node:
            logger.error('Node lost on agent %s', lost_agent_id)
        else:
            logger.error('Node lost on host %s', lost_node.hostname)

        node_mgr.lost_node(lost_agent_id)
        offer_mgr.lost_node(lost_agent_id)

        # Fail job executions that were running on the lost node (nothing to do when the node is unknown)
        exes_on_node = running_job_mgr.get_job_exes_on_node(lost_node.id) if lost_node else []
        for job_exe in exes_on_node:
            try:
                job_exe.execution_lost(start_time)
            except DatabaseError:
                logger.exception('Error failing lost job execution: %s', job_exe.id)
                # The execution could not be failed, so queue its current task (if any) for reconciliation
                current_task = job_exe.current_task
                if current_task:
                    recon_mgr.add_task_ids([current_task.id])
            # Checked whether or not the failure above succeeded
            if job_exe.is_finished():
                running_job_mgr.remove_job_exe(job_exe.id)
                cleanup_mgr.add_job_execution(job_exe)

        # Log the elapsed time, escalating to a warning when it exceeds the threshold
        elapsed = now() - start_time
        log_timing = logger.warning if elapsed > ScaleScheduler.DATABASE_WARN_THRESHOLD else logger.debug
        log_timing('Scheduler slaveLost() took %.3f seconds', elapsed.total_seconds())
Example #4
0
    def _reconcile_running_jobs(self):
        """Queries the database for running job executions and queues their current tasks for reconciliation

        Executions found in the database that the running job manager does not know about are failed with the
        built-in 'scheduler-lost' error instead.
        """

        ids_to_reconcile = []

        for db_job_exe in JobExecution.objects.get_running_job_exes():
            tracked_exe = running_job_mgr.get_job_exe(db_job_exe.id)

            if not tracked_exe:
                # The scheduler has lost track of this execution, so fail it
                Queue.objects.handle_job_failure(db_job_exe.id, now(), [],
                                                 Error.objects.get_builtin_error('scheduler-lost'))
                continue

            # Only executions that currently have a task contribute an ID
            current_task = tracked_exe.current_task
            if current_task:
                ids_to_reconcile.append(current_task.id)

        # Hand the collected task IDs to the reconciliation manager
        recon_mgr.add_task_ids(ids_to_reconcile)
Example #5
0
    def statusUpdate(self, driver, status):
        """
        Callback invoked when the status of a task has changed (for example a
        slave is lost and so the task is lost, or a task finishes and an
        executor sends a status update saying so). Returning from this
        callback acknowledges receipt of the status update. If the scheduler
        aborts during this callback (or the process exits), another status
        update will be delivered; note, however, that this is currently not
        true if the slave sending the status update is lost or fails during
        that time.

        This implementation removes the task from the reconciliation set,
        hands the update off to be saved, and routes it either to the cleanup
        manager or to the matching running job execution. The ``driver``
        argument is not used here.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        start_time = now()

        update_model = utils.create_task_update_model(status)
        mesos_status = update_model.status
        agent_id = utils.get_status_agent_id(status)
        task_update = TaskStatusUpdate(update_model, agent_id)
        task_id = task_update.task_id

        # Lost tasks are logged at warning level, every other status at info level
        log_status = logger.warning if mesos_status == 'TASK_LOST' else logger.info
        log_status('Status update for task %s: %s', task_id, mesos_status)

        # A status update has arrived for this task, so it no longer needs reconciling
        recon_mgr.remove_task_id(task_id)

        # Hand off the update model to be saved in the database
        task_update_mgr.add_task_update(update_model)

        if not task_id.startswith(CLEANUP_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task_id)

            try:
                tracked_exe = running_job_mgr.get_job_exe(job_exe_id)

                if not tracked_exe:
                    # Scheduler doesn't have any knowledge of this job execution
                    Queue.objects.handle_job_failure(job_exe_id, now(), [],
                                                     Error.objects.get_builtin_error('scheduler-lost'))
                else:
                    tracked_exe.task_update(task_update)

                    # A finished execution moves from the running job manager to the cleanup manager
                    if tracked_exe.is_finished():
                        running_job_mgr.remove_job_exe(job_exe_id)
                        cleanup_mgr.add_job_execution(tracked_exe)
            except Exception:
                logger.exception('Error handling status update for job execution: %s', job_exe_id)
                # The update could not be handled, so put the task back up for reconciliation
                recon_mgr.add_task_ids([task_id])
        else:
            # Task IDs carrying the cleanup prefix are handled by the cleanup manager
            cleanup_mgr.handle_task_update(task_update)

        # Log the elapsed time, escalating to a warning when it exceeds the threshold
        elapsed = now() - start_time
        log_timing = logger.warning if elapsed > ScaleScheduler.DATABASE_WARN_THRESHOLD else logger.debug
        log_timing('Scheduler statusUpdate() took %.3f seconds', elapsed.total_seconds())