def test_killTask(self):  # noqa
    proxy_driver = ProxyDriver()

    class ProvidedThermosRunnerMatcher(object):
        """Matcher that ensures a bound method 'stop' from 'ProvidedThermosTaskRunner' is called."""

        def __eq__(self, other):
            return (type(other.im_self).__name__ == 'ProvidedThermosTaskRunner'
                    and other.__name__ == 'stop')

    with contextlib.nested(
            temporary_dir(),
            mock.patch('apache.aurora.executor.aurora_executor.propagate_deadline',
                       wraps=propagate_deadline)) as (checkpoint_root, mock_propagate_deadline):

        _, executor = make_executor(
            proxy_driver,
            checkpoint_root,
            SLEEP60_MTI,
            stop_timeout_in_secs=123)
        # send two, expect at most one delivered
        executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
        executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
        executor.terminated.wait()

        updates = proxy_driver.method_calls['sendStatusUpdate']

        # Ensure 'stop' is called with the custom timeout.
        mock_propagate_deadline.assert_called_with(
            ProvidedThermosRunnerMatcher(),
            timeout=Amount(123, Time.SECONDS))

        assert len(updates) == 3
        assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED

def _sync_running_job_executions(self):
    """Syncs job executions that are currently running by handling any canceled or timed out
    executions
    """
    running_job_exes = {}
    for job_exe in self._job_exe_manager.get_all_job_exes():
        running_job_exes[job_exe.id] = job_exe

    right_now = now()

    for job_exe_model in JobExecution.objects.filter(id__in=running_job_exes.keys()).iterator():
        running_job_exe = running_job_exes[job_exe_model.id]
        task_to_kill = None

        if job_exe_model.status == 'CANCELED':
            task_to_kill = running_job_exe.execution_canceled()
        elif job_exe_model.is_timed_out(right_now):
            try:
                task_to_kill = running_job_exe.execution_timed_out(right_now)
            except DatabaseError:
                logger.exception('Error failing timed out job execution %i', running_job_exe.id)

        if task_to_kill:
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task_to_kill.id
            logger.info('Killing task %s', task_to_kill.id)
            self._driver.killTask(pb_task_to_kill)

        if running_job_exe.is_finished():
            self._job_exe_manager.remove_job_exe(running_job_exe.id)

def test_killTask(self):  # noqa
    proxy_driver = ProxyDriver()

    with temporary_dir() as checkpoint_root:
        _, executor = make_executor(proxy_driver, checkpoint_root, SLEEP60_MTI)
        # send two, expect at most one delivered
        executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
        executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
        executor.terminated.wait()

        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED

def test_killTask_during_runner_initialize(self):  # noqa
    proxy_driver = ProxyDriver()

    task = make_task(HELLO_WORLD_MTI)

    with temporary_dir() as td:
        te = FastThermosExecutor(
            runner_provider=make_provider(td),
            sandbox_provider=SlowSandboxProvider())
        te.launchTask(proxy_driver, task)
        te.sandbox_initialized.wait()
        te.killTask(proxy_driver, mesos_pb2.TaskID(value=task.task_id.value))
        assert te.runner_aborted.is_set()
        assert not te.sandbox_created.is_set()

        # we've simulated a "slow" initialization by blocking it until the killTask was sent -
        # so now, trigger the initialization to complete
        te._sandbox._init_start.set()

        # however, wait on the runner to definitely finish its initialization before continuing
        # (otherwise, this function races ahead too fast)
        te._sandbox._init_done.wait()

        te.sandbox_created.wait()
        assert te.sandbox_initialized.is_set()
        assert te.sandbox_created.is_set()

        proxy_driver.wait_stopped()

        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 2
        assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED

def cleanup_task():
    """Runs cleanup tasks: kills tasks for agents which have disappeared, reconciles
    running task states and gets new node lists from the master.
    """
    print "Reloading slaves @ %s" % time.ctime()
    self.scheduler_lock.acquire()
    self.scheduler.update()
    if self.stored_driver is not None:
        unmonitor = self.scheduler.unmonitor
        for slave_id, _ in unmonitor.iteritems():
            task_id = mesos_pb2.TaskID()
            task_id.value = slave_id
            if slave_id in self.scheduler.running or slave_id in self.scheduler.staging:
                print "Killing task %s" % task_id.value
                self.stored_driver.killTask(task_id)
                # TODO(nnielsen): Introduce retry for task killing.
    self.scheduler_lock.release()
    threading.Timer(1, cleanup_task).start()

def killBatchJobs(self, jobIDs):
    """Kills the given batchjob IDs."""
    localSet = set()
    if self.driver is None:
        raise RuntimeError("There is no scheduler driver")
    for jobID in jobIDs:
        log.debug("passing tasks to kill to Mesos driver")
        self.killSet.add(jobID)
        localSet.add(jobID)
        if jobID not in self.getIssuedBatchJobIDs():
            self.killSet.remove(jobID)
            localSet.remove(jobID)
            log.debug("Batchjob %s already finished", jobID)
        else:
            taskId = mesos_pb2.TaskID()
            taskId.value = str(jobID)
            self.driver.killTask(taskId)
    while localSet:
        log.debug("in while loop")
        intersection = localSet.intersection(self.killedSet)
        localSet -= intersection
        self.killedSet -= intersection
        if not intersection:
            log.debug("sleeping in the while")
            time.sleep(1)

def _create_task(tid, offer, command, ns):
    """
    `tid` (str) task id
    `offer` a mesos Offer instance
    `ns.mesos_task_resources` the stuff a task would consume:
        {"cpus": 10, "mem": 1, "disk": 12, "ports": [(20, 34), (35, 35)], "disks": ["sda1"]}
    `ns.docker_image` (str|None) a docker image you wish to execute the command in
    `ns.volumes` a list of volumes that get mounted into the container:
        [("host_path", "container_path", "mode"),
         ("/my/directory", "/path/on/container", "ro")]
    """
    task = dict(
        task_id=mesos_pb2.TaskID(value=tid),
        slave_id=offer.slave_id,
        command=mesos_pb2.CommandInfo(
            value=command,
            uris=[mesos_pb2.CommandInfo.URI(value=uri) for uri in ns.uris],
            # ability to inject os.environ values into the command
            environment=mesos_pb2.Environment(variables=[
                mesos_pb2.Environment.Variable(name=k, value=v)
                for k, v in ns.mesos_environment
            ])))
    if ns.mesos_framework_name:
        task.update(name="relay.mesos task: %s: %s" % (ns.mesos_framework_name, tid))
    else:
        task.update(name="relay.mesos task: %s" % tid)
    if ns.docker_image:
        volumes = [
            mesos_pb2.Volume(
                host_path=host_path,
                container_path=container_path,
                mode=mesos_pb2.Volume.Mode.Value(mode.upper()))
            for host_path, container_path, mode in ns.volumes
        ]
        task.update(container=mesos_pb2.ContainerInfo(
            type=mesos_pb2.ContainerInfo.DOCKER,
            volumes=volumes,
            docker=mesos_pb2.ContainerInfo.DockerInfo(
                image=ns.docker_image,
                force_pull_image=ns.force_pull_image,
                network=mesos_pb2.ContainerInfo.DockerInfo.Network.Value(ns.docker_network),
                parameters=[
                    mesos_pb2.Parameter(key=k, value=v)
                    for k, v in ns.docker_parameters.items()
                ])))
    task = mesos_pb2.TaskInfo(**task)
    _create_task_add_task_resources(task, ns)
    return task

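# --- Usage sketch (not part of the original snippet) ---
# Shows how _create_task above might be driven from a scheduler's resourceOffers
# callback. The namespace values, the command string and launch_on_offers itself
# are hypothetical assumptions; only _create_task, mesos_pb2 and the driver's
# launchTasks call come from the snippet and the Mesos Python bindings.
import argparse
import uuid


def launch_on_offers(driver, offers):
    ns = argparse.Namespace(
        uris=[],
        mesos_environment=[('EXAMPLE_VAR', 'example-value')],
        mesos_framework_name='relay',
        mesos_task_resources={'cpus': 1, 'mem': 64},
        docker_image=None,  # None skips the Docker container branch
        volumes=[],
        force_pull_image=False,
        docker_network='BRIDGE',
        docker_parameters={})
    for offer in offers:
        task = _create_task(str(uuid.uuid4()), offer, 'echo hello', ns)
        driver.launchTasks(offer.id, [task])
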
def jobFinished(self, job):
    logger.debug("job %s finished", job.id)
    if job.id in self.activeJobs:
        del self.activeJobs[job.id]
        self.activeJobsQueue.remove(job)
        for tid in self.jobTasks[job.id]:
            self.driver.killTask(mesos_pb2.TaskID(value=tid))
        del self.jobTasks[job.id]
        self.last_finish_time = time.time()

        if not self.activeJobs:
            self.slaveTasks.clear()

    for tid, jid in self.taskIdToJobId.iteritems():
        if jid not in self.activeJobs:
            logger.debug('kill task %s, because it is orphan', tid)
            self.driver.killTask(mesos_pb2.TaskID(value=tid))

def statusUpdate(self, driver, status):
    tid = status.task_id.value
    state = status.state
    logger.debug("status update: %s %s", tid, state)

    jid = self.taskIdToJobId.get(tid)
    _, task_id, tried = map(int, tid.split(':'))
    if state == mesos_pb2.TASK_RUNNING:
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(mesos_pb2.TaskID(value=tid))
        return

    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToSlaveId:
        slave_id = self.taskIdToSlaveId[tid]
        if slave_id in self.slaveTasks:
            self.slaveTasks[slave_id] -= 1
        del self.taskIdToSlaveId[tid]

    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return

    job = self.activeJobs[jid]
    if state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED) and status.data:
        try:
            reason, result, accUpdate = cPickle.loads(status.data)
            if result:
                flag, data = result
                if flag >= 2:
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                    data = decompress(data)
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception, e:
            logger.warning("error when cPickle.loads(): %s, data:%s", e, len(status.data))
            state = mesos_pb2.TASK_FAILED
            return job.statusUpdate(task_id, tried, mesos_pb2.TASK_FAILED, 'load failed: %s' % e)
        else:
            return job.statusUpdate(task_id, tried, state, reason, result, accUpdate)

def _kill_tasks(self):
    """Sends kill messages for any tasks that need to be stopped
    """
    for task in system_task_mgr.get_tasks_to_kill():
        # Send kill message for system task
        pb_task_to_kill = mesos_pb2.TaskID()
        pb_task_to_kill.value = task.id
        logger.info('Killing task %s', task.id)
        self._driver.killTask(pb_task_to_kill)

def cancel(self, proc):
    if self.driver.aborted:
        raise RuntimeError('driver already aborted')

    with self._lock:
        if proc.id in self.procs_pending:
            del self.procs_pending[proc.id]
        elif proc.id in self.procs_launched:
            del self.procs_launched[proc.id]
            self.driver.killTask(mesos_pb2.TaskID(value=str(proc.id)))

        for slave_id, procs in self.slave_to_proc.items():
            procs.pop(proc.id)
            if not procs:
                del self.slave_to_proc[slave_id]

def kill(self, password):
    """Kill the cluster.

    NOTE: Cluster killing is asynchronous. Use 'terminated' property to check if all tasks in
    the cluster are killed.
    """
    with self._lock:
        if not self._password_box.match(password, self._cluster.encrypted_password):
            raise self.PermissionError("No permission to kill cluster %s" % self.cluster_name)

        self._terminating = True

        # TODO(jyx): Task killing is unreliable. Reconciliation should retry killing.
        for task_id in self._cluster.tasks:
            log.info("Killing task %s of cluster %s" % (task_id, self.cluster_name))
            self._driver.killTask(mesos_pb2.TaskID(value=task_id))

def _perform_sync(self):
    """Performs the sync with the database
    """
    scheduler_mgr.sync_with_database()
    job_type_mgr.sync_with_database()
    workspace_mgr.sync_with_database()

    mesos_master = scheduler_mgr.mesos_address
    node_mgr.sync_with_database(mesos_master.hostname, mesos_master.port)

    # Kill running tasks for canceled job executions
    for task_to_kill in job_exe_mgr.sync_with_database():
        pb_task_to_kill = mesos_pb2.TaskID()
        pb_task_to_kill.value = task_to_kill.id
        logger.info('Killing task %s', task_to_kill.id)
        self._driver.killTask(pb_task_to_kill)

    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()

def _execute(self):
    """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
    """
    scheduler_mgr.sync_with_database()
    job_type_mgr.sync_with_database()
    workspace_mgr.sync_with_database()

    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr.update_nodes(node_mgr.get_nodes())

    mesos_master = scheduler_mgr.mesos_address
    resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

    # Kill running tasks for canceled job executions
    for task_to_kill in job_exe_mgr.sync_with_database():
        pb_task_to_kill = mesos_pb2.TaskID()
        pb_task_to_kill.value = task_to_kill.id
        logger.info('Killing task %s', task_to_kill.id)
        self._driver.killTask(pb_task_to_kill)

    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()

def _timeout_tasks(self, when):
    """Handles any tasks that have exceeded their time out thresholds

    :param when: The current time
    :type when: :class:`datetime.datetime`
    """
    # Time out tasks that have exceeded thresholds
    for task in task_mgr.get_timeout_tasks(when):
        # Handle task timeout based on the type of the task
        if task.id.startswith(JOB_TASK_ID_PREFIX):
            # Job task, notify job execution manager
            job_exe_mgr.handle_task_timeout(task, when)
        else:
            # Not a job task, so must be a node task
            node_mgr.handle_task_timeout(task)

        # Send kill message for timed out task
        pb_task_to_kill = mesos_pb2.TaskID()
        pb_task_to_kill.value = task.id
        logger.info('Killing task %s', task.id)
        self._driver.killTask(pb_task_to_kill)

def killBatchJobs(self, jobIDs):
    # FIXME: probably still racy
    assert self.driver is not None
    localSet = set()
    for jobID in jobIDs:
        self.killJobIds.add(jobID)
        localSet.add(jobID)
        self.intendedKill.add(jobID)
        # FIXME: a bit too expensive for my taste
        if jobID in self.getIssuedBatchJobIDs():
            taskId = mesos_pb2.TaskID()
            taskId.value = str(jobID)
            self.driver.killTask(taskId)
        else:
            self.killJobIds.remove(jobID)
            localSet.remove(jobID)
    while localSet:
        intersection = localSet.intersection(self.killedJobIds)
        if intersection:
            localSet -= intersection
            self.killedJobIds -= intersection
        else:
            time.sleep(1)

def test_executor_event_handlers(mocker):
    executor = mocker.Mock()
    driver = mocker.Mock()
    proxy = ExecutorProxy(executor)

    proxy.registered(driver,
                     mesos_pb2.ExecutorInfo(),
                     mesos_pb2.FrameworkInfo(),
                     mesos_pb2.SlaveInfo())
    proxy.reregistered(driver, mesos_pb2.SlaveInfo())
    proxy.disconnected(driver)
    proxy.launchTask(driver, mesos_pb2.TaskInfo())
    proxy.killTask(driver, mesos_pb2.TaskID())
    proxy.frameworkMessage(driver, 'message')
    proxy.shutdown(driver)
    proxy.error(driver, 'message')

    executor.on_registered.assert_called_once()
    executor.on_reregistered.assert_called_once()
    executor.on_disconnected.assert_called_once()
    executor.on_launch.assert_called_once()
    executor.on_kill.assert_called_once()
    executor.on_message.assert_called_once()
    executor.on_shutdown.assert_called_once()
    executor.on_error.assert_called_once()

def kill_task(self, driver, t):
    task_id = mesos_pb2.TaskID()
    task_id.value = "%s-%s" % (t.id, t.tried)
    driver.killTask(task_id)

def kill_task(self, driver, task):
    tid = mesos_pb2.TaskID()
    tid.value = task
    driver.killTask(tid)
    self.tasks_with_flags[task].mesos_task_state = TASK_KILLING

def killTask():
    taskID = request.args.get('taskID')
    print taskID
    driver.killTask(mesos_pb2.TaskID(value=taskID))
    return "Success"

def deleteTask(self, instance):
    task_id = mesos_pb2.TaskID()
    task_id.value = instance.task_id
    driver.killTask(task_id)

def killTask(self, job_id, task_id, tried):
    tid = mesos_pb2.TaskID()
    tid.value = "%s:%s:%s" % (job_id, task_id, tried)
    self.driver.killTask(tid)

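# --- Note (not part of any original snippet) ---
# The snippets above build TaskID in two styles; with the protobuf-generated
# classes both are equivalent. A minimal sketch with a made-up task id value:
tid_a = mesos_pb2.TaskID(value='job-42:0:1')   # keyword-argument constructor
tid_b = mesos_pb2.TaskID()                     # construct, then assign the field
tid_b.value = 'job-42:0:1'
assert tid_a == tid_b
# Either object can be passed to the driver's killTask(); the kill itself is
# asynchronous, and the outcome is reported later through statusUpdate
# (e.g. TASK_KILLED), which is why the tests above wait for a final update.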