def _reconcile_tasks(self, when):
    """Sends any tasks that need to be reconciled to the reconciliation manager

    :param when: The current time
    :type when: :class:`datetime.datetime`
    """

    recon_mgr.add_tasks(task_mgr.get_tasks_to_reconcile(when))
def _reconcile_running_jobs(self):
    """Reconciles all currently running job executions with Mesos"""

    # List of tasks to reconcile
    tasks_to_reconcile = []

    # Find current tasks for running executions
    for running_job_exe in job_exe_mgr.get_running_job_exes():
        task = running_job_exe.current_task
        if task:
            tasks_to_reconcile.append(task)

    # Send tasks to reconciliation thread
    recon_mgr.add_tasks(tasks_to_reconcile)
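# Example (illustrative sketch, not part of the scheduler): the two helpers above are intended to be driven
# from a background thread. The loop below is an assumption about how such a driver could look; the
# `threading` usage, the 30-second interval, and the `scheduler` variable are hypothetical, while
# `_reconcile_tasks()` and `_reconcile_running_jobs()` are the methods defined above, and `now()` is assumed
# to be the same timezone-aware helper this module already imports.
#
#     import threading
#
#     def _reconciliation_loop(scheduler, stop_event, interval_secs=30):
#         """Periodically asks the scheduler to reconcile task state with Mesos"""
#         # Reconcile all running executions once on startup (e.g., after a scheduler failover)
#         scheduler._reconcile_running_jobs()
#         while not stop_event.wait(interval_secs):
#             # On each tick, push any overdue tasks to the reconciliation manager
#             scheduler._reconcile_tasks(now())
#
#     stop_event = threading.Event()
#     thread = threading.Thread(target=_reconciliation_loop, args=(scheduler, stop_event))
#     thread.daemon = True
#     thread.start()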
def slaveLost(self, driver, slaveId):
    """Invoked when a slave has been determined unreachable (e.g., machine failure, network partition). Most
    frameworks will need to reschedule any tasks launched on this slave on a new slave.

    See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
    """

    started = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)

    if node:
        logger.warning('Node lost on host %s', node.hostname)
    else:
        logger.warning('Node lost on agent %s', agent_id)

    node_mgr.lost_node(agent_id)
    resource_mgr.lost_agent(agent_id)

    # Fail job executions that were running on the lost node
    if node:
        lost_exes = job_exe_mgr.lost_node(node.id, started)
        tasks_to_reconcile = []
        for lost_exe in lost_exes:
            # Reconcile lost tasks
            lost_task = lost_exe.current_task
            if lost_task:
                tasks_to_reconcile.append(lost_task)
            cleanup_mgr.add_job_execution(lost_exe)
        if tasks_to_reconcile:
            recon_mgr.add_tasks(tasks_to_reconcile)

    duration = now() - started
    msg = 'Scheduler slaveLost() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
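# Example (hypothetical test harness): exercising slaveLost() outside of Mesos. FakeSlaveId is a stand-in for
# the protobuf Mesos normally supplies; only its .value attribute (the agent ID string) is read by the callback
# above. The 'agent-1234' ID and the `scheduler` variable are made up for illustration, and the manager
# singletons (node_mgr, job_exe_mgr, etc.) are assumed to be wired up as they are in the running service.
#
#     from collections import namedtuple
#
#     FakeSlaveId = namedtuple('FakeSlaveId', ['value'])
#
#     # Simulate Mesos reporting that an agent has become unreachable
#     scheduler.slaveLost(driver=None, slaveId=FakeSlaveId('agent-1234'))
#     # Afterwards the node is marked lost, its resources are released, and any job executions that were
#     # running on it are failed, queued for cleanup, and their current tasks sent for reconciliation.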
def update(self, status):
    """Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task
    finishes and an executor sends a status update saying so, etc.) Note that returning from this callback
    acknowledges receipt of this status update. If for whatever reason the scheduler aborts during this callback
    (or the process exits) another status update will be delivered. Note, however, that this is currently not
    true if the slave sending the status update is lost or fails during that time.
    """

    started = now()

    model = utils.create_task_update_model(status)
    mesos_status = model.status
    task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
    task_id = task_update.task_id
    was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
    was_job_finished = False

    if mesos_status == 'TASK_ERROR':
        logger.error('Status update for task %s: %s', task_id, mesos_status)
    elif mesos_status == 'TASK_LOST':
        logger.warning('Status update for task %s: %s', task_id, mesos_status)
    else:
        logger.info('Status update for task %s: %s', task_id, mesos_status)

    # Since we have a status update for this task, remove it from reconciliation set
    recon_mgr.remove_task_id(task_id)

    # Hand off task update to be saved in the database
    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Grab job execution ID from manager
        cluster_id = JobExecution.parse_cluster_id(task_id)
        job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
        if job_exe:
            model.job_exe_id = job_exe.id
    task_update_mgr.add_task_update(model)

    # Update task with latest status
    # This should happen before the job execution or node manager are updated, since they will assume that the
    # task has already been updated
    task_mgr.handle_task_update(task_update)

    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Job task, so update the job execution
        try:
            job_exe = job_exe_mgr.handle_task_update(task_update)
            if job_exe and job_exe.is_finished():
                logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                was_job_finished = True
                cleanup_mgr.add_job_execution(job_exe)
                GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)
        except Exception:
            cluster_id = JobExecution.parse_cluster_id(task_id)
            logger.exception('Error handling status update for job execution: %s', cluster_id)

            # Error handling status update, add task so it can be reconciled
            task = task_mgr.get_task(task_id)
            if task:
                recon_mgr.add_tasks([task])
    else:
        # Not a job task, so must be either a node or system task
        node_mgr.handle_task_update(task_update)
        system_task_mgr.handle_task_update(task_update)

    scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
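# Example (illustrative restatement, not part of the scheduler): the bookkeeping that update() reports to
# scheduler_mgr. Only TaskStatusUpdate.TERMINAL_STATUSES, is_finished(), and add_task_update_counts() are
# taken from the code above; the toy helper below merely restates that logic in isolation, and `job_exe`
# is assumed to be the value returned by job_exe_mgr.handle_task_update() (None for node/system tasks).
#
#     def _count_update(task_update, job_exe):
#         """Returns (was_task_finished, was_job_finished) for one status update"""
#         was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
#         was_job_finished = bool(job_exe and job_exe.is_finished())
#         return was_task_finished, was_job_finished
#
#     scheduler_mgr.add_task_update_counts(*_count_update(task_update, job_exe))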