Example #1
  def test_killTask(self):  # noqa
    proxy_driver = ProxyDriver()

    class ProvidedThermosRunnerMatcher(object):
      """Matcher that ensures a bound method 'stop' from 'ProvidedThermosTaskRunner' is called."""

      def __eq__(self, other):
        return (type(other.im_self).__name__ == 'ProvidedThermosTaskRunner'
            and other.__name__ == 'stop')

    with contextlib.nested(
        temporary_dir(),
        mock.patch('apache.aurora.executor.aurora_executor.propagate_deadline',
            wraps=propagate_deadline)) as (checkpoint_root, mock_propagate_deadline):

      _, executor = make_executor(
          proxy_driver,
          checkpoint_root,
          SLEEP60_MTI,
          stop_timeout_in_secs=123)
      # send two, expect at most one delivered
      executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
      executor.killTask(proxy_driver, mesos_pb2.TaskID(value='sleep60-001'))
      executor.terminated.wait()

      updates = proxy_driver.method_calls['sendStatusUpdate']

      mock_propagate_deadline.assert_called_with(  # Ensure 'stop' is called with custom timeout.
          ProvidedThermosRunnerMatcher(),
          timeout=Amount(123, Time.SECONDS))
      assert len(updates) == 3
      assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED
Example #2
    def _sync_running_job_executions(self):
        """Syncs job executions that are currently running by handling any canceled or timed out executions
        """

        running_job_exes = {}
        for job_exe in self._job_exe_manager.get_all_job_exes():
            running_job_exes[job_exe.id] = job_exe

        right_now = now()

        for job_exe_model in JobExecution.objects.filter(id__in=running_job_exes.keys()).iterator():
            running_job_exe = running_job_exes[job_exe_model.id]
            task_to_kill = None

            if job_exe_model.status == 'CANCELED':
                task_to_kill = running_job_exe.execution_canceled()
            elif job_exe_model.is_timed_out(right_now):
                try:
                    task_to_kill = running_job_exe.execution_timed_out(right_now)
                except DatabaseError:
                    logger.exception('Error failing timed out job execution %i', running_job_exe.id)

            if task_to_kill:
                pb_task_to_kill = mesos_pb2.TaskID()
                pb_task_to_kill.value = task_to_kill.id
                logger.info('Killing task %s', task_to_kill.id)
                self._driver.killTask(pb_task_to_kill)

            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(running_job_exe.id)
Example #3
    def test_killTask(self):  # noqa
        proxy_driver = ProxyDriver()

        with temporary_dir() as checkpoint_root:
            _, executor = make_executor(proxy_driver, checkpoint_root,
                                        SLEEP60_MTI)
            # send two, expect at most one delivered
            executor.killTask(proxy_driver,
                              mesos_pb2.TaskID(value='sleep60-001'))
            executor.killTask(proxy_driver,
                              mesos_pb2.TaskID(value='sleep60-001'))
            executor.terminated.wait()

        updates = proxy_driver.method_calls['sendStatusUpdate']
        assert len(updates) == 3
        assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED
Example #4
    def test_killTask_during_runner_initialize(self):  # noqa
        proxy_driver = ProxyDriver()

        task = make_task(HELLO_WORLD_MTI)

        with temporary_dir() as td:
            te = FastThermosExecutor(runner_provider=make_provider(td),
                                     sandbox_provider=SlowSandboxProvider())
            te.launchTask(proxy_driver, task)
            te.sandbox_initialized.wait()
            te.killTask(proxy_driver,
                        mesos_pb2.TaskID(value=task.task_id.value))
            assert te.runner_aborted.is_set()
            assert not te.sandbox_created.is_set()

            # we've simulated a "slow" initialization by blocking it until the killTask was sent - so now,
            # trigger the initialization to complete
            te._sandbox._init_start.set()

            # however, wait on the runner to definitely finish its initialization before continuing
            # (otherwise, this function races ahead too fast)
            te._sandbox._init_done.wait()
            te.sandbox_created.wait()
            assert te.sandbox_initialized.is_set()
            assert te.sandbox_created.is_set()

            proxy_driver.wait_stopped()

            updates = proxy_driver.method_calls['sendStatusUpdate']
            assert len(updates) == 2
            assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED
Example #5
        def cleanup_task():
            """
            Runs cleanup tasks, i.e. kills tasks for agents that have disappeared, reconciles
            running task states, and gets new node lists from the master.
            """

            print "Reloading slaves @ %s" % time.ctime()

            self.scheduler_lock.acquire()

            self.scheduler.update()

            if self.stored_driver is not None:
                unmonitor = self.scheduler.unmonitor
                for slave_id, _ in unmonitor.iteritems():
                    task_id = mesos_pb2.TaskID()
                    task_id.value = slave_id

                    if slave_id in self.scheduler.running or slave_id in self.scheduler.staging:
                        print "Killing task %s" % task_id.value
                        self.stored_driver.killTask(task_id)

                    # TODO(nnielsen): Introduce retry for task killing.

            self.scheduler_lock.release()

            threading.Timer(1, cleanup_task).start()
Example #6
    def killBatchJobs(self, jobIDs):
        """
        Kills the given batchjob IDs.
        """
        localSet = set()
        if self.driver is None:
            raise RuntimeError("There is no scheduler driver")
        for jobID in jobIDs:
            log.debug("passing tasks to kill to Mesos driver")
            self.killSet.add(jobID)
            localSet.add(jobID)

            if jobID not in self.getIssuedBatchJobIDs():
                self.killSet.remove(jobID)
                localSet.remove(jobID)
                log.debug("Batchjob %s already finished", jobID)
            else:
                taskId = mesos_pb2.TaskID()
                taskId.value = str(jobID)
                self.driver.killTask(taskId)

        while localSet:
            log.debug("in while loop")
            intersection = localSet.intersection(self.killedSet)
            localSet -= intersection
            self.killedSet -= intersection
            if not intersection:
                log.debug("sleeping in the while")
                time.sleep(1)
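
A sketch of the statusUpdate() counterpart the loop above is waiting on: once a task we asked to kill reaches a terminal state, its job ID moves into killedSet so killBatchJobs() can drain localSet. The surrounding scheduler class, the integer job IDs, and the exact set of terminal states are assumptions, not code from the original project.

def statusUpdate(self, driver, update):
    # Record kills we requested so killBatchJobs() stops waiting on them.
    jobID = int(update.task_id.value)
    if update.state in (mesos_pb2.TASK_KILLED, mesos_pb2.TASK_FINISHED,
                        mesos_pb2.TASK_FAILED, mesos_pb2.TASK_LOST):
        if jobID in self.killSet:
            self.killedSet.add(jobID)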
Example #7
def _create_task(tid, offer, command, ns):
    """
    `tid` (str) task id
    `offer` a mesos Offer instance
    `ns.mesos_task_resources` the stuff a task would consume:
        {
            "cpus": 10,
            "mem": 1,
            "disk": 12,
            "ports": [(20, 34), (35, 35)],
            "disks": ["sda1"]
        }
    `ns.docker_image` (str|None)
        a docker image you wish to execute the command in
    `ns.volumes` a list of volumes that get mounted into the container:
        [
          ("host_path", "container_path", "mode"),
          ("/my/directory", "/path/on/container", "ro")
        ]
    """
    task = dict(
        task_id=mesos_pb2.TaskID(value=tid),
        slave_id=offer.slave_id,
        command=mesos_pb2.CommandInfo(
            value=command,
            uris=[mesos_pb2.CommandInfo.URI(value=uri) for uri in ns.uris],
            environment=mesos_pb2.Environment(variables=[
                mesos_pb2.Environment.Variable(name=k, value=v)
                for k, v in ns.mesos_environment
            ])))
    if ns.mesos_framework_name:
        task.update(name="relay.mesos task: %s: %s" %
                    (ns.mesos_framework_name, tid))
    else:
        task.update(name="relay.mesos task: %s" % tid)
    # ability to inject os.environ values into the command
    if ns.docker_image:
        volumes = [
            mesos_pb2.Volume(host_path=host_path,
                             container_path=container_path,
                             mode=mesos_pb2.Volume.Mode.Value(mode.upper()))
            for host_path, container_path, mode in ns.volumes
        ]
        task.update(container=mesos_pb2.ContainerInfo(
            type=mesos_pb2.ContainerInfo.DOCKER,
            volumes=volumes,
            docker=mesos_pb2.ContainerInfo.DockerInfo(
                image=ns.docker_image,
                force_pull_image=ns.force_pull_image,
                network=mesos_pb2.ContainerInfo.DockerInfo.Network.Value(
                    ns.docker_network),
                parameters=[
                    mesos_pb2.Parameter(key=k, value=v)
                    for k, v in ns.docker_parameters.items()
                ],
            )))
    task = mesos_pb2.TaskInfo(**task)
    _create_task_add_task_resources(task, ns)
    return task
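
One way to exercise _create_task() above: the ns argument only needs the attributes the function (and its resource helper) reads, so a plain argparse.Namespace is enough for a quick experiment. The field values and the fake offer below are illustrative, not taken from the original project.

import argparse
import mesos_pb2

# Hypothetical settings; only attributes read by _create_task() need to exist.
ns = argparse.Namespace(
    mesos_task_resources={"cpus": 1, "mem": 128, "disk": 10},
    docker_image=None,            # leave None to skip the ContainerInfo branch
    volumes=[],
    uris=[],
    mesos_environment=[("RELAY_DEBUG", "1")],
    mesos_framework_name="relay",
    force_pull_image=False,
    docker_network="BRIDGE",
    docker_parameters={})

# Minimal fake offer carrying the slave_id that _create_task() copies over.
offer = mesos_pb2.Offer(
    id=mesos_pb2.OfferID(value="offer-1"),
    framework_id=mesos_pb2.FrameworkID(value="fw-1"),
    slave_id=mesos_pb2.SlaveID(value="slave-1"),
    hostname="localhost")

task = _create_task("task-1", offer, "echo hello", ns)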
Example #8
    def jobFinished(self, job):
        logger.debug("job %s finished", job.id)
        if job.id in self.activeJobs:
            del self.activeJobs[job.id]
            self.activeJobsQueue.remove(job)
            for tid in self.jobTasks[job.id]:
                self.driver.killTask(mesos_pb2.TaskID(value=tid))
            del self.jobTasks[job.id]
            self.last_finish_time = time.time()

            if not self.activeJobs:
                self.slaveTasks.clear()

        for tid, jid in self.taskIdToJobId.iteritems():
            if jid not in self.activeJobs:
                logger.debug('kill task %s, because it is orphan', tid)
                self.driver.killTask(mesos_pb2.TaskID(value=tid))
Example #9
    def statusUpdate(self, driver, status):
        tid = status.task_id.value
        state = status.state
        logger.debug("status update: %s %s", tid, state)

        jid = self.taskIdToJobId.get(tid)
        _, task_id, tried = map(int, tid.split(':'))
        if state == mesos_pb2.TASK_RUNNING:
            if jid in self.activeJobs:
                job = self.activeJobs[jid]
                job.statusUpdate(task_id, tried, state)
            else:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(mesos_pb2.TaskID(value=tid))

            return

        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToSlaveId:
            slave_id = self.taskIdToSlaveId[tid]
            if slave_id in self.slaveTasks:
                self.slaveTasks[slave_id] -= 1
            del self.taskIdToSlaveId[tid]

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        if state in (mesos_pb2.TASK_FINISHED,
                     mesos_pb2.TASK_FAILED) and status.data:
            try:
                reason, result, accUpdate = cPickle.loads(status.data)
                if result:
                    flag, data = result
                    if flag >= 2:
                        try:
                            data = urllib.urlopen(data).read()
                        except IOError:
                            # try again
                            data = urllib.urlopen(data).read()
                        flag -= 2
                    data = decompress(data)
                    if flag == 0:
                        result = marshal.loads(data)
                    else:
                        result = cPickle.loads(data)
            except Exception, e:
                logger.warning("error when cPickle.loads(): %s, data:%s", e,
                               len(status.data))
                state = mesos_pb2.TASK_FAILED
                return job.statusUpdate(task_id, tried, mesos_pb2.TASK_FAILED,
                                        'load failed: %s' % e)
            else:
                return job.statusUpdate(task_id, tried, state, reason, result,
                                        accUpdate)
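
For context on the decoding above, a sketch of what the sending (executor) side might look like, assuming decompress() is plain zlib: the flag selects the serializer (0 = marshal, 1 = cPickle), and adding 2 would mean the payload is a URL to fetch instead of inline bytes. This only illustrates the format implied by the decoder; it is not the original executor code.

import cPickle
import marshal
import zlib

def encode_result(reason, obj, acc_update, use_marshal=True):
    # Mirror of the decode path in statusUpdate() above.
    if use_marshal:
        flag, data = 0, marshal.dumps(obj)
    else:
        flag, data = 1, cPickle.dumps(obj, -1)
    return cPickle.dumps((reason, (flag, zlib.compress(data)), acc_update), -1)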
Example #10
    def _kill_tasks(self):
        """Sends kill messages for any tasks that need to be stopped
        """

        for task in system_task_mgr.get_tasks_to_kill():
            # Send kill message for system task
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task.id
            logger.info('Killing task %s', task.id)
            self._driver.killTask(pb_task_to_kill)
Example #11
    def cancel(self, proc):
        if self.driver.aborted:
            raise RuntimeError('driver already aborted')

        with self._lock:
            if proc.id in self.procs_pending:
                del self.procs_pending[proc.id]
            elif proc.id in self.procs_launched:
                del self.procs_launched[proc.id]
                self.driver.killTask(mesos_pb2.TaskID(value=str(proc.id)))

            for slave_id, procs in self.slave_to_proc.items():
                procs.pop(proc.id)
                if not procs:
                    del self.slave_to_proc[slave_id]
Example #12
  def kill(self, password):
    """
      Kill the cluster.

      NOTE: Cluster killing is asynchronous. Use the 'terminated' property to check whether all
      tasks in the cluster have been killed.
    """
    with self._lock:
      if not self._password_box.match(password, self._cluster.encrypted_password):
        raise self.PermissionError("No permission to kill cluster %s" % self.cluster_name)

      self._terminating = True

      # TODO(jyx): Task killing is unreliable. Reconciliation should retry killing.
      for task_id in self._cluster.tasks:
        log.info("Killing task %s of cluster %s" % (task_id, self.cluster_name))
        self._driver.killTask(mesos_pb2.TaskID(value=task_id))
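
Because the kill above is asynchronous, callers have to poll. A minimal sketch, assuming 'terminated' is a boolean property on the same object that becomes true once every task in the cluster has reached a terminal state:

import time

def kill_and_wait(cluster, password, timeout=60, interval=1):
    # Request the kill, then poll the 'terminated' property from the
    # docstring until it flips or the timeout expires.
    cluster.kill(password)
    deadline = time.time() + timeout
    while time.time() < deadline:
        if cluster.terminated:
            return True
        time.sleep(interval)
    return False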
Example #13
    def _perform_sync(self):
        """Performs the sync with the database
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        mesos_master = scheduler_mgr.mesos_address
        node_mgr.sync_with_database(mesos_master.hostname, mesos_master.port)

        # Kill running tasks for canceled job executions
        for task_to_kill in job_exe_mgr.sync_with_database():
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task_to_kill.id
            logger.info('Killing task %s', task_to_kill.id)
            self._driver.killTask(pb_task_to_kill)
        
        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Example #14
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        mesos_master = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

        # Kill running tasks for canceled job executions
        for task_to_kill in job_exe_mgr.sync_with_database():
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task_to_kill.id
            logger.info('Killing task %s', task_to_kill.id)
            self._driver.killTask(pb_task_to_kill)

        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Example #15
    def _timeout_tasks(self, when):
        """Handles any tasks that have exceeded their time out thresholds

        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        # Time out tasks that have exceeded thresholds
        for task in task_mgr.get_timeout_tasks(when):
            # Handle task timeout based on the type of the task
            if task.id.startswith(JOB_TASK_ID_PREFIX):
                # Job task, notify job execution manager
                job_exe_mgr.handle_task_timeout(task, when)
            else:
                # Not a job task, so must be a node task
                node_mgr.handle_task_timeout(task)

            # Send kill message for timed out task
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task.id
            logger.info('Killing task %s', task.id)
            self._driver.killTask(pb_task_to_kill)
Example #16
 def killBatchJobs(self, jobIDs):
     # FIXME: probably still racy
     assert self.driver is not None
     localSet = set()
     for jobID in jobIDs:
         self.killJobIds.add(jobID)
         localSet.add(jobID)
         self.intendedKill.add(jobID)
         # FIXME: a bit too expensive for my taste
         if jobID in self.getIssuedBatchJobIDs():
             taskId = mesos_pb2.TaskID()
             taskId.value = str(jobID)
             self.driver.killTask(taskId)
         else:
             self.killJobIds.remove(jobID)
             localSet.remove(jobID)
     while localSet:
         intersection = localSet.intersection(self.killedJobIds)
         if intersection:
             localSet -= intersection
             self.killedJobIds -= intersection
         else:
             time.sleep(1)
Example #17
def test_executor_event_handlers(mocker):
    executor = mocker.Mock()
    driver = mocker.Mock()
    proxy = ExecutorProxy(executor)

    proxy.registered(driver, mesos_pb2.ExecutorInfo(),
                     mesos_pb2.FrameworkInfo(), mesos_pb2.SlaveInfo())
    proxy.reregistered(driver, mesos_pb2.SlaveInfo())
    proxy.disconnected(driver)
    proxy.launchTask(driver, mesos_pb2.TaskInfo())
    proxy.killTask(driver, mesos_pb2.TaskID())
    proxy.frameworkMessage(driver, 'message')
    proxy.shutdown(driver)
    proxy.error(driver, 'message')

    executor.on_registered.assert_called_once()
    executor.on_reregistered.assert_called_once()
    executor.on_disconnected.assert_called_once()
    executor.on_launch.assert_called_once()
    executor.on_kill.assert_called_once()
    executor.on_message.assert_called_once()
    executor.on_shutdown.assert_called_once()
    executor.on_error.assert_called_once()
Example #18
 def kill_task(self, driver, t):
     task_id = mesos_pb2.TaskID()
     task_id.value = "%s-%s" % (t.id, t.tried)
     driver.killTask(task_id)
Example #19
 def kill_task(self, driver, task):
     tid = mesos_pb2.TaskID()
     tid.value = task
     driver.killTask(tid)
     self.tasks_with_flags[task].mesos_task_state = TASK_KILLING
Example #20
def killTask():
  taskID = request.args.get('taskID')
  print taskID
  driver.killTask(mesos_pb2.TaskID(value=taskID))
  return "Success"
Example #21
 def deleteTask(self, instance):
     task_id = mesos_pb2.TaskID()
     task_id.value = instance.task_id
     driver.killTask(task_id)
Example #22
 def killTask(self, job_id, task_id, tried):
     tid = mesos_pb2.TaskID()
     tid.value = "%s:%s:%s" % (job_id, task_id, tried)
     self.driver.killTask(tid)