Example #1
    def _sync_running_job_executions(self):
        """Syncs job executions that are currently running by handling any canceled or timed out executions
        """

        running_job_exes = {}
        for job_exe in running_job_mgr.get_all_job_exes():
            running_job_exes[job_exe.id] = job_exe

        right_now = now()

        for job_exe_model in JobExecution.objects.filter(id__in=running_job_exes.keys()).iterator():
            running_job_exe = running_job_exes[job_exe_model.id]
            task_to_kill = None

            if job_exe_model.status == 'CANCELED':
                try:
                    task_to_kill = running_job_exe.execution_canceled()
                except DatabaseError:
                    logger.exception('Error canceling job execution %i', running_job_exe.id)
            elif job_exe_model.is_timed_out(right_now):
                try:
                    task_to_kill = running_job_exe.execution_timed_out(right_now)
                except DatabaseError:
                    logger.exception('Error failing timed out job execution %i', running_job_exe.id)

            if task_to_kill:
                pb_task_to_kill = mesos_pb2.TaskID()
                pb_task_to_kill.value = task_to_kill.id
                logger.info('Killing task %s', task_to_kill.id)
                self._driver.killTask(pb_task_to_kill)

            if running_job_exe.is_finished():
                running_job_mgr.remove_job_exe(running_job_exe.id)
                cleanup_mgr.add_job_execution(running_job_exe)
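A minimal sketch of the sync pattern in this example, reduced to plain Python: build an id-to-object map of running executions, walk the matching database rows, and collect any tasks that need to be killed. The FakeRunningExe class and the (id, status) row tuples are hypothetical stand-ins, not part of the Scale codebase.

class FakeRunningExe(object):
    """Hypothetical stand-in for a running job execution"""

    def __init__(self, exe_id):
        self.id = exe_id

    def execution_canceled(self):
        # Return an identifier for the task that should be killed
        return 'task-for-exe-%i' % self.id


def sync_running_executions(running_exes, db_rows):
    """running_exes is {id: FakeRunningExe}, db_rows is a list of (id, status)"""

    tasks_to_kill = []
    for exe_id, status in db_rows:
        running_exe = running_exes[exe_id]
        if status == 'CANCELED':
            task_id = running_exe.execution_canceled()
            if task_id:
                tasks_to_kill.append(task_id)
    return tasks_to_kill


running = {1: FakeRunningExe(1), 2: FakeRunningExe(2)}
print(sync_running_executions(running, [(1, 'CANCELED'), (2, 'RUNNING')]))
# ['task-for-exe-1']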
Example #2
    def slaveLost(self, driver, slaveId):
        """
        Invoked when a slave has been determined unreachable (e.g., machine
        failure, network partition.) Most frameworks will need to reschedule
        any tasks launched on this slave on a new slave.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
        """

        started = now()

        agent_id = slaveId.value
        node = node_mgr.get_node(agent_id)

        if node:
            logger.warning('Node lost on host %s', node.hostname)
        else:
            logger.warning('Node lost on agent %s', agent_id)

        node_mgr.lost_node(agent_id)
        resource_mgr.lost_agent(agent_id)

        # Fail job executions that were running on the lost node
        if node:
            for finished_job_exe in job_exe_mgr.lost_node(node.id, started):
                cleanup_mgr.add_job_execution(finished_job_exe)

        duration = now() - started
        msg = 'Scheduler slaveLost() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
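The duration-threshold logging at the end of this callback is a reusable pattern: time the work, then log at warning level only if it ran longer than expected. A minimal sketch, with a placeholder threshold and logger rather than the scheduler's real NORMAL_WARN_THRESHOLD:

import logging
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)
WARN_THRESHOLD = timedelta(seconds=1)  # placeholder value


def timed_callback():
    started = datetime.utcnow()

    # ... the actual callback work would happen here ...

    duration = datetime.utcnow() - started
    msg = 'Callback took %.3f seconds'
    if duration > WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())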
Example #3
    def slaveLost(self, driver, slaveId):
        """
        Invoked when a slave has been determined unreachable (e.g., machine
        failure, network partition.) Most frameworks will need to reschedule
        any tasks launched on this slave on a new slave.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
        """

        started = now()

        agent_id = slaveId.value
        node = node_mgr.get_node(agent_id)

        if node:
            logger.error('Node lost on host %s', node.hostname)
        else:
            logger.error('Node lost on agent %s', agent_id)

        node_mgr.lost_node(agent_id)
        offer_mgr.lost_node(agent_id)

        # Fail job executions that were running on the lost node
        if node:
            for running_job_exe in running_job_mgr.get_job_exes_on_node(node.id):
                try:
                    running_job_exe.execution_lost(started)
                except DatabaseError:
                    logger.exception('Error failing lost job execution: %s', running_job_exe.id)
                    # Error failing execution, add task so it can be reconciled
                    task = running_job_exe.current_task
                    if task:
                        recon_mgr.add_task_ids([task.id])
                if running_job_exe.is_finished():
                    running_job_mgr.remove_job_exe(running_job_exe.id)
                    cleanup_mgr.add_job_execution(running_job_exe)

        duration = now() - started
        msg = 'Scheduler slaveLost() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #4
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        mesos_master = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

        # Handle canceled job executions
        for finished_job_exe in job_exe_mgr.sync_with_database():
            cleanup_mgr.add_job_execution(finished_job_exe)

        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
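The loop over job_exe_mgr.sync_with_database() relies on the sync call returning the executions that finished (for example, were canceled) since the last pass, so they can be handed to the cleanup manager. A minimal sketch of that hand-off, with hypothetical in-memory managers in place of Scale's real ones:

class FakeJobExecutionManager(object):
    """Hypothetical stand-in that tracks executions finished since the last sync"""

    def __init__(self):
        self._finished = []

    def mark_finished(self, job_exe_id):
        self._finished.append(job_exe_id)

    def sync_with_database(self):
        # Return and clear everything that finished since the last sync
        finished, self._finished = self._finished, []
        return finished


class FakeCleanupManager(object):
    """Hypothetical stand-in that queues executions for cleanup"""

    def __init__(self):
        self.pending = []

    def add_job_execution(self, job_exe):
        self.pending.append(job_exe)


job_exe_mgr = FakeJobExecutionManager()
cleanup_mgr = FakeCleanupManager()
job_exe_mgr.mark_finished(42)
for finished_job_exe in job_exe_mgr.sync_with_database():
    cleanup_mgr.add_job_execution(finished_job_exe)
print(cleanup_mgr.pending)  # [42]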
Example #5
    def _schedule_waiting_tasks(self, nodes, running_job_exes, when):
        """Schedules all waiting tasks for which there are sufficient resources and updates the resource manager with
        any resource shortages. All scheduling nodes that have fulfilled all of their waiting tasks will be returned so
        new job executions can be added to them.

        :param nodes: The dict of scheduling nodes stored by node ID
        :type nodes: dict
        :param running_job_exes: The currently running job executions
        :type running_job_exes: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: The dict of scheduling nodes stored by node ID that have no more waiting tasks
        :rtype: dict
        """

        fulfilled_nodes = {}  # {Node ID: SchedulingNode}
        waiting_tasks = []

        # Schedule waiting node tasks first
        for node in nodes.values():
            has_waiting_tasks = node.accept_node_tasks(when, waiting_tasks)
            if node.is_ready_for_next_job_task and not has_waiting_tasks:
                # A node can only be fulfilled if it is able to run waiting tasks and it has no more waiting tasks
                fulfilled_nodes[node.node_id] = node

        # Schedule job executions already on the node waiting for their next task
        node_lost_job_exes_ids = []
        for running_job_exe in running_job_exes:
            if running_job_exe.node_id not in nodes:  # Unknown/lost node
                node_lost_job_exes_ids.append(running_job_exe.id)
            else:
                node = nodes[running_job_exe.node_id]
                if not node.is_ready_for_next_job_task or node.agent_id != running_job_exe.agent_id:
                    # Node is deprecated, offline, or has switched agent IDs
                    node_lost_job_exes_ids.append(running_job_exe.id)
                elif running_job_exe.is_next_task_ready():
                    has_waiting_tasks = node.accept_job_exe_next_task(
                        running_job_exe, waiting_tasks)
                    if has_waiting_tasks and node.node_id in fulfilled_nodes:
                        # Node has tasks waiting for resources
                        del fulfilled_nodes[node.node_id]

        # Handle any running job executions that have lost their node or become starved
        finished_job_exes = job_exe_mgr.check_for_starvation(when)
        if node_lost_job_exes_ids:
            finished_job_exes.extend(
                job_exe_mgr.lost_job_exes(node_lost_job_exes_ids, when))
        for finished_job_exe in finished_job_exes:
            cleanup_mgr.add_job_execution(finished_job_exe)

        # Update waiting task counts and calculate shortages
        agent_shortages = {}  # {Agent ID: NodeResources}
        new_waiting_tasks = {}  # {Task ID: int}
        for task in waiting_tasks:
            if task.id in self._waiting_tasks:
                count = self._waiting_tasks[task.id] + 1
            else:
                count = 1
            new_waiting_tasks[task.id] = count
            if count >= TASK_SHORTAGE_WAIT_COUNT:
                # This task has waited too long for resources, generate a shortage
                if task.agent_id in agent_shortages:
                    agent_shortages[task.agent_id].add(task.get_resources())
                else:
                    resources = NodeResources()
                    resources.add(task.get_resources())
                    agent_shortages[task.agent_id] = resources
        self._waiting_tasks = new_waiting_tasks
        resource_mgr.set_agent_shortages(agent_shortages)

        return fulfilled_nodes
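The shortage bookkeeping at the end of this method counts how many scheduling passes each task has been waiting and only reports a resource shortage once a task has waited at least TASK_SHORTAGE_WAIT_COUNT passes. A minimal sketch of that counting logic, with a namedtuple task and plain float CPU values standing in for Scale's Task and NodeResources classes:

from collections import namedtuple

Task = namedtuple('Task', ['id', 'agent_id', 'cpus'])
TASK_SHORTAGE_WAIT_COUNT = 3  # placeholder threshold


def update_shortages(previous_counts, waiting_tasks):
    """Returns (new wait counts by task id, CPU shortage by agent id)"""

    new_counts = {}
    shortages = {}
    for task in waiting_tasks:
        count = previous_counts.get(task.id, 0) + 1
        new_counts[task.id] = count
        if count >= TASK_SHORTAGE_WAIT_COUNT:
            # Task has waited too long for resources, add it to the shortage
            shortages[task.agent_id] = shortages.get(task.agent_id, 0.0) + task.cpus
    return new_counts, shortages


counts = {'task-a': 2}
counts, shortages = update_shortages(counts, [Task('task-a', 'agent-1', 1.0)])
print(shortages)  # {'agent-1': 1.0}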
Example #6
    def update(self, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
        task_id = task_update.task_id
        was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
        was_job_finished = False

        if mesos_status == 'TASK_ERROR':
            logger.error('Status update for task %s: %s', task_id, mesos_status)
        elif mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Grab job execution ID from manager
            cluster_id = JobExecution.parse_cluster_id(task_id)
            job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
            if job_exe:
                model.job_exe_id = job_exe.id
        task_update_mgr.add_task_update(model)

        # Update task with latest status
        # This should happen before the job execution or node manager are updated, since they will assume that the task
        # has already been updated
        task_mgr.handle_task_update(task_update)

        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Job task, so update the job execution
            try:
                job_exe = job_exe_mgr.handle_task_update(task_update)
                if job_exe and job_exe.is_finished():
                    logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                    was_job_finished = True
                    cleanup_mgr.add_job_execution(job_exe)
                    GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)

            except Exception:
                cluster_id = JobExecution.parse_cluster_id(task_id)
                logger.exception('Error handling status update for job execution: %s', cluster_id)
                # Error handling status update, add task so it can be reconciled
                task = task_mgr.get_task(task_id)
                if task:
                    recon_mgr.add_tasks([task])
        else:
            # Not a job task, so must be either a node or system task
            node_mgr.handle_task_update(task_update)
            system_task_mgr.handle_task_update(task_update)

        scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
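Both this example and the next use the same recovery pattern when a status update cannot be processed: log the exception and record the task id so the task can be reconciled with Mesos later. A minimal sketch of that pattern, with a plain set standing in for the reconciliation manager:

import logging

logger = logging.getLogger(__name__)
reconcile_task_ids = set()  # stand-in for the reconciliation manager


def handle_status_update(task_id, handler):
    try:
        handler(task_id)
    except Exception:
        logger.exception('Error handling status update for task %s', task_id)
        # Could not process the update, queue the task for reconciliation
        reconcile_task_ids.add(task_id)


def failing_handler(task_id):
    raise ValueError('simulated handler failure')


handle_status_update('task-1', failing_handler)
print(reconcile_task_ids)  # {'task-1'}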
Example #7
    def statusUpdate(self, driver, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.

        See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status))
        task_id = task_update.task_id

        if mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        task_update_mgr.add_task_update(model)

        if task_id.startswith(CLEANUP_TASK_ID_PREFIX):
            cleanup_mgr.handle_task_update(task_update)
        else:
            job_exe_id = JobExecution.get_job_exe_id(task_id)

            try:
                running_job_exe = running_job_mgr.get_job_exe(job_exe_id)

                if running_job_exe:
                    running_job_exe.task_update(task_update)

                    # Remove finished job execution
                    if running_job_exe.is_finished():
                        running_job_mgr.remove_job_exe(job_exe_id)
                        cleanup_mgr.add_job_execution(running_job_exe)
                else:
                    # Scheduler doesn't have any knowledge of this job execution
                    Queue.objects.handle_job_failure(job_exe_id, now(), [],
                                                     Error.objects.get_builtin_error('scheduler-lost'))
            except Exception:
                logger.exception('Error handling status update for job execution: %s', job_exe_id)
                # Error handling status update, add task so it can be reconciled
                recon_mgr.add_task_ids([task_id])

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())