def _sync_running_job_executions(self):
    """Syncs job executions that are currently running by handling any canceled or timed out executions"""

    # Index the currently running executions by their database ID
    exe_by_id = {exe.id: exe for exe in running_job_mgr.get_all_job_exes()}

    right_now = now()
    for exe_model in JobExecution.objects.filter(id__in=exe_by_id.keys()).iterator():
        exe = exe_by_id[exe_model.id]

        # Determine whether this execution was canceled or has timed out;
        # either case may hand back a task that must be killed on Mesos
        kill_task = None
        if exe_model.status == 'CANCELED':
            try:
                kill_task = exe.execution_canceled()
            except DatabaseError:
                logger.exception('Error canceling job execution %i', exe.id)
        elif exe_model.is_timed_out(right_now):
            try:
                kill_task = exe.execution_timed_out(right_now)
            except DatabaseError:
                logger.exception('Error failing timed out job execution %i', exe.id)

        if kill_task:
            # Ask the Mesos driver to kill the execution's outstanding task
            task_id_pb = mesos_pb2.TaskID()
            task_id_pb.value = kill_task.id
            logger.info('Killing task %s', kill_task.id)
            self._driver.killTask(task_id_pb)

        if exe.is_finished():
            # Execution is done: stop tracking it and queue node cleanup
            running_job_mgr.remove_job_exe(exe.id)
            cleanup_mgr.add_job_execution(exe)
def slaveLost(self, driver, slaveId): """ Invoked when a slave has been determined unreachable (e.g., machine failure, network partition.) Most frameworks will need to reschedule any tasks launched on this slave on a new slave. See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`. """ started = now() agent_id = slaveId.value node = node_mgr.get_node(agent_id) if node: logger.error('Node lost on host %s', node.hostname) else: logger.error('Node lost on agent %s', agent_id) node_mgr.lost_node(agent_id) offer_mgr.lost_node(agent_id) # Fail job executions that were running on the lost node if node: for running_job_exe in running_job_mgr.get_job_exes_on_node(node.id): try: running_job_exe.execution_lost(started) except DatabaseError: logger.exception('Error failing lost job execution: %s', running_job_exe.id) # Error failing execution, add task so it can be reconciled task = running_job_exe.current_task if task: recon_mgr.add_task_ids([task.id]) if running_job_exe.is_finished(): running_job_mgr.remove_job_exe(running_job_exe.id) cleanup_mgr.add_job_execution(running_job_exe) duration = now() - started msg = 'Scheduler slaveLost() took %.3f seconds' if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD: logger.warning(msg, duration.total_seconds()) else: logger.debug(msg, duration.total_seconds())
def statusUpdate(self, driver, status): """ Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes and an executor sends a status update saying so, etc.) Note that returning from this callback acknowledges receipt of this status update. If for whatever reason the scheduler aborts during this callback (or the process exits) another status update will be delivered. Note, however, that this is currently not true if the slave sending the status update is lost or fails during that time. See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`. """ started = now() model = utils.create_task_update_model(status) mesos_status = model.status task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status)) task_id = task_update.task_id if mesos_status == 'TASK_LOST': logger.warning('Status update for task %s: %s', task_id, mesos_status) else: logger.info('Status update for task %s: %s', task_id, mesos_status) # Since we have a status update for this task, remove it from reconciliation set recon_mgr.remove_task_id(task_id) # Hand off task update to be saved in the database task_update_mgr.add_task_update(model) if task_id.startswith(CLEANUP_TASK_ID_PREFIX): cleanup_mgr.handle_task_update(task_update) else: job_exe_id = JobExecution.get_job_exe_id(task_id) try: running_job_exe = running_job_mgr.get_job_exe(job_exe_id) if running_job_exe: running_job_exe.task_update(task_update) # Remove finished job execution if running_job_exe.is_finished(): running_job_mgr.remove_job_exe(job_exe_id) cleanup_mgr.add_job_execution(running_job_exe) else: # Scheduler doesn't have any knowledge of this job execution Queue.objects.handle_job_failure(job_exe_id, now(), [], Error.objects.get_builtin_error('scheduler-lost')) except Exception: logger.exception('Error handling status update for job execution: %s', job_exe_id) # Error handling status update, add task so it can be reconciled recon_mgr.add_task_ids([task_id]) 
duration = now() - started msg = 'Scheduler statusUpdate() took %.3f seconds' if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD: logger.warning(msg, duration.total_seconds()) else: logger.debug(msg, duration.total_seconds())