Esempio n. 1
0
    def executorLost(self, driver, executorId, slaveId, status):
        """
        Scheduler callback for an executor that has been lost.

        Looks up the node registered for the agent and logs a warning naming
        the node's hostname, falling back to the raw agent ID when no node is
        found. The time spent in this callback is then logged: at warning
        level when it exceeds ``ScaleScheduler.NORMAL_WARN_THRESHOLD``, at
        debug level otherwise.

        See documentation for :meth:`mesos_api.mesos.Scheduler.executorLost`.

        :param driver: Not used by this implementation
        :param executorId: ID of the lost executor; its ``value`` is logged
        :param slaveId: ID of the agent; its ``value`` is used for the node lookup
        :param status: Not used by this implementation
        """

        callback_start = now()

        lost_agent = slaveId.value
        known_node = node_mgr.get_node(lost_agent)

        # Prefer the hostname in the log when the agent maps to a known node
        if not known_node:
            logger.warning('Executor %s lost on agent: %s', executorId.value,
                           lost_agent)
        else:
            logger.warning('Executor %s lost on host: %s', executorId.value,
                           known_node.hostname)

        # Report how long the callback took, escalating when it was slow
        elapsed = now() - callback_start
        if elapsed > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            log_timing = logger.warning
        else:
            log_timing = logger.debug
        log_timing('Scheduler executorLost() took %.3f seconds',
                   elapsed.total_seconds())
Esempio n. 2
0
    def slaveLost(self, driver, slaveId):
        """
        Scheduler callback for an agent (slave) that has been lost.

        Logs a warning identifying the lost node by hostname (or by agent ID
        when no node is registered for it), reports the loss to the node and
        resource managers, and, when the node is known, hands every job
        execution returned by ``job_exe_mgr.lost_node()`` to the cleanup
        manager. The time spent in this callback is logged at the end: at
        warning level when it exceeds ``ScaleScheduler.NORMAL_WARN_THRESHOLD``,
        at debug level otherwise.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.

        :param driver: Not used by this implementation
        :param slaveId: ID of the lost agent; its ``value`` identifies the agent
        """

        callback_start = now()

        lost_agent = slaveId.value
        lost_node = node_mgr.get_node(lost_agent)

        if not lost_node:
            logger.warning('Node lost on agent %s', lost_agent)
        else:
            logger.warning('Node lost on host %s', lost_node.hostname)

        # Let the managers that track agents know this one is gone
        node_mgr.lost_node(lost_agent)
        resource_mgr.lost_agent(lost_agent)

        # Job executions that were on the lost node get queued for cleanup
        if lost_node:
            for job_exe in job_exe_mgr.lost_node(lost_node.id, callback_start):
                cleanup_mgr.add_job_execution(job_exe)

        # Report how long the callback took, escalating when it was slow
        elapsed = now() - callback_start
        too_slow = elapsed > ScaleScheduler.NORMAL_WARN_THRESHOLD
        log_timing = logger.warning if too_slow else logger.debug
        log_timing('Scheduler slaveLost() took %.3f seconds',
                   elapsed.total_seconds())
Esempio n. 3
0
    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
        Scheduler callback for a message sent by an executor.

        The message is only logged (at info level), together with the
        executor ID and either the hostname of the node registered for the
        agent or, when no node is found, the raw agent ID. The time spent in
        this callback is then logged: at warning level when it exceeds
        ``ScaleScheduler.NORMAL_WARN_THRESHOLD``, at debug level otherwise.

        See documentation for :meth:`mesos_api.mesos.Scheduler.frameworkMessage`.

        :param driver: Not used by this implementation
        :param executorId: ID of the sending executor; its ``value`` is logged
        :param slaveId: ID of the agent; its ``value`` is used for the node lookup
        :param message: The message payload, passed to the logger as-is
        """

        callback_start = now()

        sender_agent = slaveId.value
        sender_node = node_mgr.get_node(sender_agent)

        # Prefer the hostname in the log when the agent maps to a known node
        if not sender_node:
            logger.info('Message from %s on agent %s: %s', executorId.value,
                        sender_agent, message)
        else:
            logger.info('Message from %s on host %s: %s', executorId.value,
                        sender_node.hostname, message)

        # Report how long the callback took, escalating when it was slow
        elapsed = now() - callback_start
        if elapsed > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            log_timing = logger.warning
        else:
            log_timing = logger.debug
        log_timing('Scheduler frameworkMessage() took %.3f seconds',
                   elapsed.total_seconds())
Esempio n. 4
0
    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
        Handles a message sent to the scheduler by an executor.

        Nothing is done with the message beyond logging it at info level. The
        log entry names the executor and the host of the node registered for
        the agent; when the agent has no registered node, the agent ID is
        logged instead. Afterwards the duration of the callback is logged, as
        a warning if it is above ``ScaleScheduler.NORMAL_WARN_THRESHOLD`` and
        as a debug entry if not.

        See documentation for :meth:`mesos_api.mesos.Scheduler.frameworkMessage`.

        :param driver: Not used by this implementation
        :param executorId: ID of the sending executor; its ``value`` is logged
        :param slaveId: ID of the agent; its ``value`` is used for the node lookup
        :param message: The message payload, passed to the logger as-is
        """

        began = now()

        source_agent_id = slaveId.value
        source_node = node_mgr.get_node(source_agent_id)

        if not source_node:
            logger.info('Message from %s on agent %s: %s', executorId.value, source_agent_id, message)
        else:
            logger.info('Message from %s on host %s: %s', executorId.value, source_node.hostname, message)

        # Slow callbacks are surfaced as warnings, normal ones only as debug
        time_taken = now() - began
        over_threshold = time_taken > ScaleScheduler.NORMAL_WARN_THRESHOLD
        emit = logger.warning if over_threshold else logger.debug
        emit('Scheduler frameworkMessage() took %.3f seconds', time_taken.total_seconds())
Esempio n. 5
0
    def slaveLost(self, driver, slaveId):
        """
        Scheduler callback for an agent (slave) that has been lost.

        Logs an error identifying the lost node by hostname (or by agent ID
        when no node is registered for it) and reports the loss to the node
        and offer managers. When the node is known, every job execution the
        running job manager lists for that node is marked as lost. If marking
        an execution raises ``DatabaseError``, the error is logged and the
        execution's current task (if any) is handed to the reconciliation
        manager. Executions that report themselves finished are removed from
        the running job manager and queued for cleanup.

        The time spent in this callback is logged at the end: at warning
        level when it exceeds ``ScaleScheduler.DATABASE_WARN_THRESHOLD``, at
        debug level otherwise.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.

        :param driver: Not used by this implementation
        :param slaveId: ID of the lost agent; its ``value`` identifies the agent
        """

        callback_start = now()

        lost_agent = slaveId.value
        lost_node = node_mgr.get_node(lost_agent)

        if not lost_node:
            logger.error('Node lost on agent %s', lost_agent)
        else:
            logger.error('Node lost on host %s', lost_node.hostname)

        # Let the managers that track agents know this one is gone
        node_mgr.lost_node(lost_agent)
        offer_mgr.lost_node(lost_agent)

        # Fail whatever was running on the node, if we know which node it was
        if lost_node:
            for job_exe in running_job_mgr.get_job_exes_on_node(lost_node.id):
                try:
                    job_exe.execution_lost(callback_start)
                except DatabaseError:
                    logger.exception('Error failing lost job execution: %s', job_exe.id)
                    # The execution could not be failed, so have its current
                    # task reconciled instead
                    current = job_exe.current_task
                    if current:
                        recon_mgr.add_task_ids([current.id])

                # The finished check runs whether or not the failure above
                # succeeded
                if not job_exe.is_finished():
                    continue
                running_job_mgr.remove_job_exe(job_exe.id)
                cleanup_mgr.add_job_execution(job_exe)

        # Report how long the callback took, escalating when it was slow
        elapsed = now() - callback_start
        if elapsed > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            log_timing = logger.warning
        else:
            log_timing = logger.debug
        log_timing('Scheduler slaveLost() took %.3f seconds', elapsed.total_seconds())
Esempio n. 6
0
    def executorLost(self, driver, executorId, slaveId, status):
        """
        Handles the loss of an executor.

        An error is logged that names the executor and the host of the node
        registered for the agent; when the agent has no registered node, the
        agent ID is logged instead. Afterwards the duration of the callback is
        logged, as a warning if it is above
        ``ScaleScheduler.NORMAL_WARN_THRESHOLD`` and as a debug entry if not.

        See documentation for :meth:`mesos_api.mesos.Scheduler.executorLost`.

        :param driver: Not used by this implementation
        :param executorId: ID of the lost executor; its ``value`` is logged
        :param slaveId: ID of the agent; its ``value`` is used for the node lookup
        :param status: Not used by this implementation
        """

        began = now()

        executor_agent_id = slaveId.value
        executor_node = node_mgr.get_node(executor_agent_id)

        if not executor_node:
            logger.error('Executor %s lost on agent: %s', executorId.value, executor_agent_id)
        else:
            logger.error('Executor %s lost on host: %s', executorId.value, executor_node.hostname)

        # Slow callbacks are surfaced as warnings, normal ones only as debug
        time_taken = now() - began
        over_threshold = time_taken > ScaleScheduler.NORMAL_WARN_THRESHOLD
        emit = logger.warning if over_threshold else logger.debug
        emit('Scheduler executorLost() took %.3f seconds', time_taken.total_seconds())