def executorLost(self, driver, executorId, slaveId, status):
    """ Invoked when an executor has exited/terminated. Note that any tasks running will have TASK_LOST status
    updates automatically generated.

    See documentation for :meth:`mesos_api.mesos.Scheduler.executorLost`.
    """

    started = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)

    if node:
        logger.warning('Executor %s lost on host: %s', executorId.value, node.hostname)
    else:
        logger.warning('Executor %s lost on agent: %s', executorId.value, agent_id)

    duration = now() - started
    msg = 'Scheduler executorLost() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
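# The started/duration bookkeeping above is repeated verbatim in every callback.
# A minimal sketch of factoring it into a context manager; timed_callback is a
# hypothetical helper, not part of the source, and assumes the same now() and
# logger used by the callbacks.
from contextlib import contextmanager

@contextmanager
def timed_callback(name, threshold):
    """Log how long a scheduler callback took, warning above the given threshold."""
    started = now()
    try:
        yield
    finally:
        duration = now() - started
        msg = 'Scheduler %s() took %.3f seconds'
        if duration > threshold:
            logger.warning(msg, name, duration.total_seconds())
        else:
            logger.debug(msg, name, duration.total_seconds())

# Usage sketch:
#     with timed_callback('executorLost', ScaleScheduler.NORMAL_WARN_THRESHOLD):
#         ...  # callback body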
def slaveLost(self, driver, slaveId):
    """ Invoked when a slave has been determined unreachable (e.g., machine failure, network partition).
    Most frameworks will need to reschedule any tasks launched on this slave on a new slave.

    See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
    """

    started = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)

    if node:
        logger.warning('Node lost on host %s', node.hostname)
    else:
        logger.warning('Node lost on agent %s', agent_id)

    node_mgr.lost_node(agent_id)
    resource_mgr.lost_agent(agent_id)

    # Fail job executions that were running on the lost node
    if node:
        for finished_job_exe in job_exe_mgr.lost_node(node.id, started):
            cleanup_mgr.add_job_execution(finished_job_exe)

    duration = now() - started
    msg = 'Scheduler slaveLost() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def frameworkMessage(self, driver, executorId, slaveId, message):
    """ Invoked when an executor sends a message. These messages are best effort; do not expect a framework
    message to be retransmitted in any reliable fashion.

    See documentation for :meth:`mesos_api.mesos.Scheduler.frameworkMessage`.
    """

    started = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)

    if node:
        logger.info('Message from %s on host %s: %s', executorId.value, node.hostname, message)
    else:
        logger.info('Message from %s on agent %s: %s', executorId.value, agent_id, message)

    duration = now() - started
    msg = 'Scheduler frameworkMessage() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
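# A minimal sketch of how a scheduler with these callbacks gets wired into a
# driver, assuming the standard Mesos Python bindings (mesos.interface /
# mesos.native); the master URL, framework name, and ScaleScheduler constructor
# arguments are placeholders, not taken from the source.
from mesos.interface import mesos_pb2
import mesos.native

framework = mesos_pb2.FrameworkInfo()
framework.user = ''          # empty string lets Mesos fill in the current user
framework.name = 'Scale'

driver = mesos.native.MesosSchedulerDriver(ScaleScheduler(), framework,
                                           'zk://localhost:2181/mesos')

# run() blocks until the driver stops; the callbacks above (executorLost,
# slaveLost, frameworkMessage) are invoked from the driver's event loop.
status = driver.run()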