Example #1
0
    def testJobQueue(self, testJobs=1000):
        """
        The mesos JobQueue sorts MesosShape objects by requirement and
        this test ensures that that sorting is what is expected:
        non-preemptible jobs groups first, with priority given to large jobs.
        """
        from toil.batchSystems.mesos import JobQueue
        jobQueue = JobQueue()

        for jobNum in range(0, testJobs):
            testJob = self._getJob(cores=random.choice(list(range(10))),
                                   preemptable=random.choice([True, False]))
            jobQueue.insertJob(testJob, testJob.resources)

        sortedTypes = jobQueue.sortedTypes
        self.assertGreaterEqual(20, len(sortedTypes))
        self.assertTrue(
            all(sortedTypes[i] <= sortedTypes[i + 1]
                for i in range(len(sortedTypes) - 1)))

        preemptable = sortedTypes.pop(0).preemptable
        for jtype in sortedTypes:
            # all non preemptable jobTypes must be first in sorted order
            if preemptable:
                # all the rest of the jobTypes must be preemptable as well
                assert jtype.preemptable
            elif jtype.preemptable:
                # we have reached our first preemptable job
                preemptable = jtype.preemptable

        # make sure proper number of jobs are in queue
        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        testJob = self._getJob(cores=random.choice(list(range(10))))
        jobQueue.insertJob(testJob, testJob.resources)
        testJobs += 1

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        tmpJob = None
        while not jobQueue.typeEmpty(testJob.resources):
            testJobs -= 1
            tmpJob = jobQueue.nextJobOfType(testJob.resources)

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)
        # Ensure FIFO
        self.assertIs(testJob, tmpJob)
Example #2
0
    def testJobQueue(self, testJobs=1000):
        from toil.batchSystems.mesos import JobQueue
        jobQueue = JobQueue()

        for jobNum in range(0, testJobs):
            testJob = self._getJob(cores=random.choice(range(10)),
                                   preemptable=random.choice([True, False]))
            jobQueue.insertJob(testJob, testJob.resources)

        sortedTypes = jobQueue.sorted()
        # test this is properly sorted
        self.assertGreaterEqual(20, len(sortedTypes))
        self.assertTrue(
            all(sortedTypes[i] <= sortedTypes[i + 1]
                for i in range(len(sortedTypes) - 1)))

        preemptable = sortedTypes.pop(0).preemptable
        for jtype in sortedTypes:
            # all non preemptable jobTypes must be first in sorted order
            if preemptable:
                # all the rest of the jobTypes must be preemptable as well
                assert jtype.preemptable
            elif jtype.preemptable:
                # we have reached our first preemptable job
                preemptable = jtype.preemptable

        # make sure proper number of jobs are in queue
        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        testJob = self._getJob(cores=random.choice(range(10)))
        jobQueue.insertJob(testJob, testJob.resources)
        testJobs += 1

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        tmpJob = None
        while not jobQueue.typeEmpty(testJob.resources):
            testJobs -= 1
            tmpJob = jobQueue.nextJobOfType(testJob.resources)

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)
        # Ensure FIFO
        self.assertIs(testJob, tmpJob)
    def testJobQueue(self, testJobs=1000):
        from toil.batchSystems.mesos import JobQueue
        jobQueue = JobQueue()

        for jobNum in range(0, testJobs):
            testJob = self._getJob(cores=random.choice(range(10)), preemptable=random.choice([True, False]))
            jobQueue.insertJob(testJob, testJob.resources)

        sortedTypes = jobQueue.sorted()
        # test this is properly sorted
        self.assertGreaterEqual(20, len(sortedTypes))
        self.assertTrue(all(sortedTypes[i] <= sortedTypes[i + 1] for i in range(len(sortedTypes) - 1)))

        preemptable = sortedTypes.pop(0).preemptable
        for jtype in sortedTypes:
            # all non preemptable jobTypes must be first in sorted order
            if preemptable:
                # all the rest of the jobTypes must be preemptable as well
                assert jtype.preemptable
            elif jtype.preemptable:
                # we have reached our first preemptable job
                preemptable = jtype.preemptable

        # make sure proper number of jobs are in queue
        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        testJob = self._getJob(cores=random.choice(range(10)))
        jobQueue.insertJob(testJob, testJob.resources)
        testJobs += 1

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)

        tmpJob = None
        while not jobQueue.typeEmpty(testJob.resources):
            testJobs -= 1
            tmpJob = jobQueue.nextJobOfType(testJob.resources)

        self.assertEqual(len(jobQueue.jobIDs()), testJobs)
        # Ensure FIFO
        self.assertIs(testJob, tmpJob)
Example #4
0
class MesosBatchSystem(BatchSystemSupport,
                       AbstractScalableBatchSystem,
                       mesos.interface.Scheduler):
    """
    A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
    tasks over a cluster of slave nodes. A Mesos framework consists of a scheduler and an
    executor. This class acts as the scheduler and is typically run on the master node that also
    runs the Mesos master process with which the scheduler communicates via a driver component.
    The executor is implemented in a separate class. It is run on each slave node and
    communicates with the Mesos slave process via another driver object. The scheduler may also
    be run on a separate node from the master, which we then call somewhat ambiguously the driver
    node.
    """

    @classmethod
    def supportsHotDeployment(cls):
        return True

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    class ExecutorInfo(object):
        def __init__(self, nodeAddress, slaveId, nodeInfo, lastSeen):
            super(MesosBatchSystem.ExecutorInfo, self).__init__()
            self.nodeAddress = nodeAddress
            self.slaveId = slaveId
            self.nodeInfo = nodeInfo
            self.lastSeen = lastSeen

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)

        # The hot-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesosMasterAddress = config.mesosMasterAddress

        # Written to when Mesos kills tasks, as directed by Toil
        self.killedJobIds = set()

        # The IDs of job to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs were called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids
        # this is somewhat redundant since Mesos returns the number of workers per
        # node. However, that information isn't guaranteed to reach the leader,
        # so we also track the state here. When the information is returned from
        # mesos, prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # see self.setNodeFilter
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an
        #  approximation of the truth. Recently launched nodes may be absent from this set for a
        # while and a node's absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since non-preemptability is a stronger
        # requirement. If we tracked the set of preemptable nodes instead, we'd have to use
        # absence as an indicator of non-preemptability and could therefore be misled into
        # believeing that a recently launched preemptable node was non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        self.unusedJobID = itertools.count()
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self.ignoredNodes = set()

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def ignoreNode(self, nodeAddress):
        self.ignoredNodes.add(nodeAddress)

    def unignoreNode(self, nodeAddress):
        self.ignoredNodes.remove(nodeAddress)

    def issueBatchJob(self, jobNode):
        """
        Issues the following command returning a unique jobID. Command is the string to run, memory
        is an int giving the number of bytes the job needs to run in and cores is the number of cpus
        needed for the job and error-file is the path of the file to place any std-err/std-out in.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)
        jobID = next(self.unusedJobID)
        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=ResourceRequirement(**jobNode._requirements),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=self.environment.copy(),
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))

        # TODO: round all elements of resources

        self.jobQueues.insertJob(job, jobType)
        self.taskResources[jobID] = job.resources
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):
        # FIXME: probably still racy
        assert self.driver is not None
        localSet = set()
        for jobID in jobIDs:
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            self.intendedKill.add(jobID)
            # FIXME: a bit too expensive for my taste
            if jobID in self.getIssuedBatchJobIDs():
                taskId = mesos_pb2.TaskID()
                taskId.value = str(jobID)
                self.driver.killTask(taskId)
            else:
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        while localSet:
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                self.killedJobIds -= intersection
            else:
                time.sleep(1)

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds)

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            jobId, exitValue, wallTime = item
            try:
                self.intendedKill.remove(jobId)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', jobId, exitValue,
                          '???' if wallTime is None else str(wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', jobId)

    def nodeInUse(self, nodeIP):
        return nodeIP in self.hostToJobIDs

    @contextmanager
    def nodeFiltering(self, filter):
        self.nodeFilter = [filter]
        yield
        self.nodeFilter = []

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return self.reconciliationPeriod

    @classmethod
    def getRescueBatchJobFrequency(cls):
        return 30 * 60  # Half an hour

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo instance representing our executor implementation.
        """
        # The executor program is installed as a setuptools entry point by setup.py
        info = mesos_pb2.ExecutorInfo()
        info.name = "toil"
        info.command.value = resolveEntryPoint('_toil_mesos_executor')
        info.executor_id.value = "toil-%i" % os.getpid()
        info.source = pwd.getpwuid(os.getuid()).pw_name
        return info

    def _startDriver(self):
        """
        The Mesos driver thread which handles the scheduler's communication with the Mesos master
        """
        framework = mesos_pb2.FrameworkInfo()
        framework.user = ""  # Have Mesos fill in the current user.
        framework.name = "toil"
        framework.principal = framework.name
        self.driver = mesos.native.MesosSchedulerDriver(self,
                                                        framework,
                                                        self._resolveAddress(self.mesosMasterAddress),
                                                        True)  # enable implicit acknowledgements
        assert self.driver.start() == mesos_pb2.DRIVER_RUNNING

    @staticmethod
    def _resolveAddress(address):
        """
        Resolves the host in the given string. The input is of the form host[:port]. This method
        is idempotent, i.e. the host may already be a dotted IP address.

        >>> # noinspection PyProtectedMember
        >>> f=MesosBatchSystem._resolveAddress
        >>> f('localhost')
        '127.0.0.1'
        >>> f('127.0.0.1')
        '127.0.0.1'
        >>> f('localhost:123')
        '127.0.0.1:123'
        >>> f('127.0.0.1:123')
        '127.0.0.1:123'
        """
        address = address.split(':')
        assert len(address) in (1, 2)
        address[0] = socket.gethostbyname(address[0])
        return ':'.join(address)

    def shutdown(self):
        log.debug("Stopping Mesos driver")
        self.driver.stop()
        log.debug("Joining Mesos driver")
        driver_result = self.driver.join()
        log.debug("Joined Mesos driver")
        if driver_result != mesos_pb2.DRIVER_STOPPED:
            raise RuntimeError("Mesos driver failed with %i", driver_result)

    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master
        """
        log.debug("Registered with framework ID %s", frameworkId.value)

    def _declineAllOffers(self, driver, offers):
        for offer in offers:
            log.debug("Declining offer %s.", offer.id.value)
            driver.declineOffer(offer.id)

    def _parseOffer(self, offer):
        cores = 0
        memory = 0
        disk = 0
        preemptable = None
        for attribute in offer.attributes:
            if attribute.name == 'preemptable':
                assert preemptable is None, "Attribute 'preemptable' occurs more than once."
                preemptable = strict_bool(attribute.text.value)
        if preemptable is None:
            log.debug('Slave not marked as either preemptable or not. Assuming non-preemptable.')
            preemptable = False
        for resource in offer.resources:
            if resource.name == "cpus":
                cores += resource.scalar.value
            elif resource.name == "mem":
                memory += resource.scalar.value
            elif resource.name == "disk":
                disk += resource.scalar.value
        return cores, memory, disk, preemptable

    def _prepareToRun(self, jobType, offer):
        # Get the first element to insure FIFO
        job = self.jobQueues.nextJobOfType(jobType)
        task = self._newMesosTask(job, offer)
        return task

    def _updateStateToRunning(self, offer, runnableTasks):
        for task in runnableTasks:
            resourceKey = int(task.task_id.value)
            resources = self.taskResources[resourceKey]
            slaveIP = socket.gethostbyname(offer.hostname)
            try:
                self.hostToJobIDs[slaveIP].append(resourceKey)
            except KeyError:
                self.hostToJobIDs[slaveIP] = [resourceKey]

            self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
                                                                   slaveID=offer.slave_id.value,
                                                                   slaveIP=slaveIP,
                                                                   executorID=task.executor.executor_id.value,
                                                                   cores=resources.cores,
                                                                   memory=resources.memory)
            del self.taskResources[resourceKey]
            log.debug('Launched Mesos task %s.', task.task_id.value)

    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework.
        """
        self._trackOfferedNodes(offers)

        jobTypes = self.jobQueues.sorted()

        # TODO: We may want to assert that numIssued >= numRunning
        if not jobTypes or len(self.getIssuedBatchJobIDs()) == len(self.getRunningBatchJobIDs()):
            log.debug('There are no queued tasks. Declining Mesos offers.')
            # Without jobs, we can get stuck with no jobs and no new offers until we decline it.
            self._declineAllOffers(driver, offers)
            return

        unableToRun = True
        # Right now, gives priority to largest jobs
        for offer in offers:
            if offer.hostname in self.ignoredNodes:
                log.debug("Declining offer %s because node %s is designated for termination" %
                        (offer.id.value, offer.hostname))
                driver.declineOffer(offer.id)
                continue
            runnableTasks = []
            # TODO: In an offer, can there ever be more than one resource with the same name?
            offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer)
            log.debug('Got offer %s for a %spreemptable slave with %.2f MiB memory, %.2f core(s) '
                      'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptable else 'non-',
                      offerMemory, offerCores, offerDisk)
            remainingCores = offerCores
            remainingMemory = offerMemory
            remainingDisk = offerDisk

            for jobType in jobTypes:
                runnableTasksOfType = []
                # Because we are not removing from the list until outside of the while loop, we
                # must decrement the number of jobs left to run ourselves to avoid an infinite
                # loop.
                nextToLaunchIndex = 0
                # Toil specifies disk and memory in bytes but Mesos uses MiB
                while ( not self.jobQueues.typeEmpty(jobType)
                       # On a non-preemptable node we can run any job, on a preemptable node we
                       # can only run preemptable jobs:
                       and (not offerPreemptable or jobType.preemptable)
                       and remainingCores >= jobType.cores
                       and remainingDisk >= toMiB(jobType.disk)
                       and remainingMemory >= toMiB(jobType.memory)):
                    task = self._prepareToRun(jobType, offer)
                    # TODO: this used to be a conditional but Hannes wanted it changed to an assert
                    # TODO: ... so we can understand why it exists.
                    assert int(task.task_id.value) not in self.runningJobMap
                    runnableTasksOfType.append(task)
                    log.debug("Preparing to launch Mesos task %s using offer %s ...",
                              task.task_id.value, offer.id.value)
                    remainingCores -= jobType.cores
                    remainingMemory -= toMiB(jobType.memory)
                    remainingDisk -= toMiB(jobType.disk)
                    nextToLaunchIndex += 1
                else:
                    log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
                              '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
                              'and %(disk)s of disk on a %(non)spreemptable slave.',
                              dict(offer=offer.id.value,
                                   requirements=jobType.__dict__,
                                   non='' if offerPreemptable else 'non-',
                                   memory=fromMiB(offerMemory),
                                   cores=offerCores,
                                   disk=fromMiB(offerDisk)))
                runnableTasks.extend(runnableTasksOfType)
            # Launch all runnable tasks together so we only call launchTasks once per offer
            if runnableTasks:
                unableToRun = False
                driver.launchTasks(offer.id, runnableTasks)
                self._updateStateToRunning(offer, runnableTasks)
            else:
                log.debug('Although there are queued jobs, none of them could be run with offer %s '
                          'extended to the framework.', offer.id)
                driver.declineOffer(offer.id)

        if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
            self.lastTimeOfferLogged = time.time()
            log.debug('Although there are queued jobs, none of them were able to run in '
                     'any of the offers extended to the framework. There are currently '
                     '%i jobs running. Enable debug level logging to see more details about '
                     'job types and offers received.', len(self.runningJobMap))

    def _trackOfferedNodes(self, offers):
        for offer in offers:
            nodeAddress = socket.gethostbyname(offer.hostname)
            self._registerNode(nodeAddress, offer.slave_id.value)
            preemptable = False
            for attribute in offer.attributes:
                if attribute.name == 'preemptable':
                    preemptable = strict_bool(attribute.text.value)
            if preemptable:
                try:
                    self.nonPreemptableNodes.remove(offer.slave_id.value)
                except KeyError:
                    pass
            else:
                self.nonPreemptableNodes.add(offer.slave_id.value)

    def _filterOfferedNodes(self, offers):
        if not self.nodeFilter:
            return offers
        executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers]
        executorInfos = [_f for _f in executorInfoOrNone if _f]
        executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
        ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
        return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider]

    def _newMesosTask(self, job, offer):
        """
        Build the Mesos task object for a given the Toil job and Mesos offer
        """
        task = mesos_pb2.TaskInfo()
        task.task_id.value = str(job.jobID)
        task.slave_id.value = offer.slave_id.value
        task.name = job.name
        task.data = pickle.dumps(job)
        task.executor.MergeFrom(self.executor)

        cpus = task.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = job.resources.cores

        disk = task.resources.add()
        disk.name = "disk"
        disk.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.disk) > 1:
            disk.scalar.value = toMiB(job.resources.disk)
        else:
            log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.disk)
            disk.scalar.value = 1
        mem = task.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.memory) > 1:
            mem.scalar.value = toMiB(job.resources.memory)
        else:
            log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.memory)
            mem.scalar.value = 1
        return task

    def statusUpdate(self, driver, update):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost and so the task is
        lost, a task finishes and an executor sends a status update saying so, etc). Note that
        returning from this callback _acknowledges_ receipt of this status update! If for
        whatever reason the scheduler aborts during this callback (or the process exits) another
        status update will be delivered (note, however, that this is currently not true if the
        slave sending the status update is lost/fails during that time).
        """
        jobID = int(update.task_id.value)
        stateName = mesos_pb2.TaskState.Name(update.state)
        log.debug("Job %i is in state '%s'.", jobID, stateName)

        def jobEnded(_exitStatus, wallTime=None):
            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                self.killedJobIds.add(jobID)
            self.updatedJobsQueue.put((jobID, _exitStatus, wallTime))
            slaveIP = None
            try:
                slaveIP = self.runningJobMap[jobID].slaveIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                del self.runningJobMap[jobID]

            try:
                self.hostToJobIDs[slaveIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

        if update.state == mesos_pb2.TASK_FINISHED:
            jobEnded(0, wallTime=unpack('d', update.data)[0])
        elif update.state == mesos_pb2.TASK_FAILED:
            try:
                exitStatus = int(update.message)
            except ValueError:
                exitStatus = 255
                log.warning("Job %i failed with message '%s'", jobID, update.message)
            else:
                log.warning('Job %i failed with exit status %i', jobID, exitStatus)
            jobEnded(exitStatus)
        elif update.state in (mesos_pb2.TASK_LOST, mesos_pb2.TASK_KILLED, mesos_pb2.TASK_ERROR):
            log.warning("Job %i is in unexpected state %s with message '%s'.",
                        jobID, stateName, update.message)
            jobEnded(255)

    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
        Invoked when an executor sends a message.
        """
        log.debug('Got framework message from executor %s running on slave %s: %s',
                  executorId.value, slaveId.value, message)
        message = ast.literal_eval(message)
        assert isinstance(message, dict)
        # Handle the mandatory fields of a message
        nodeAddress = message.pop('address')
        executor = self._registerNode(nodeAddress, slaveId.value)
        # Handle optional message fields
        for k, v in iteritems(message):
            if k == 'nodeInfo':
                assert isinstance(v, dict)
                resources = [taskData for taskData in itervalues(self.runningJobMap)
                             if taskData.executorID == executorId.value]
                requestedCores = sum(taskData.cores for taskData in resources)
                requestedMemory = sum(taskData.memory for taskData in resources)
                executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v)
                self.executors[nodeAddress] = executor
            else:
                raise RuntimeError("Unknown message field '%s'." % k)

    def _registerNode(self, nodeAddress, slaveId):
        executor = self.executors.get(nodeAddress)
        if executor is None or executor.slaveId != slaveId:
            executor = self.ExecutorInfo(nodeAddress=nodeAddress,
                                         slaveId=slaveId,
                                         nodeInfo=None,
                                         lastSeen=time.time())
            self.executors[nodeAddress] = executor
        else:
            executor.lastSeen = time.time()
        return executor

    def getNodes(self, preemptable=None, timeout=600):
        timeout = timeout or sys.maxsize
        return {nodeAddress: executor.nodeInfo
                for nodeAddress, executor in iteritems(self.executors)
                if time.time() - executor.lastSeen < timeout
                and (preemptable is None
                     or preemptable == (executor.slaveId not in self.nonPreemptableNodes))}

    def reregistered(self, driver, masterInfo):
        """
        Invoked when the scheduler re-registers with a newly elected Mesos master.
        """
        log.debug('Registered with new master')

    def executorLost(self, driver, executorId, slaveId, status):
        """
        Invoked when an executor has exited/terminated.
        """
        log.warning("Executor '%s' lost.", executorId)


    @classmethod
    def setOptions(cl, setOption):
        setOption("mesosMasterAddress", None, None, 'localhost:5050')
Example #5
0
class MesosBatchSystem(BatchSystemLocalSupport,
                       AbstractScalableBatchSystem,
                       Scheduler):
    """
    A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
    tasks over a cluster of agent nodes. A Mesos framework consists of a scheduler and an
    executor. This class acts as the scheduler and is typically run on the master node that also
    runs the Mesos master process with which the scheduler communicates via a driver component.
    The executor is implemented in a separate class. It is run on each agent node and
    communicates with the Mesos agent process via another driver object. The scheduler may also
    be run on a separate node from the master, which we then call somewhat ambiguously the driver
    node.
    """

    @classmethod
    def supportsAutoDeployment(cls):
        return True

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    class ExecutorInfo:
        def __init__(self, nodeAddress, agentId, nodeInfo, lastSeen):
            super(MesosBatchSystem.ExecutorInfo, self).__init__()
            self.nodeAddress = nodeAddress
            self.agentId = agentId
            self.nodeInfo = nodeInfo
            self.lastSeen = lastSeen

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super().__init__(config, maxCores, maxMemory, maxDisk)

        # The auto-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesos_endpoint = config.mesos_endpoint

        # Written to when Mesos kills tasks, as directed by Toil.
        # Jobs must not enter this set until they are removed from runningJobMap.
        self.killedJobIds = set()

        # The IDs of job to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs were called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids
        # this is somewhat redundant since Mesos returns the number of workers per
        # node. However, that information isn't guaranteed to reach the leader,
        # so we also track the state here. When the information is returned from
        # mesos, prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # see self.setNodeFilter
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # The string framework ID that we are assigned when registering with the Mesos master
        self.frameworkId = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A dictionary mapping back from agent ID to the last observed IP address of its node.
        self.agentsByID = {}

        # A set of Mesos agent IDs, one for each agent running on a
        # non-preemptable node. Only an approximation of the truth. Recently
        # launched nodes may be absent from this set for a while and a node's
        # absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since
        # non-preemptability is a stronger requirement. If we tracked the set
        # of preemptable nodes instead, we'd have to use absence as an
        # indicator of non-preemptability and could therefore be misled into
        # believing that a recently launched preemptable node was
        # non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self.ignoredNodes = set()

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def ignoreNode(self, nodeAddress):
        self.ignoredNodes.add(nodeAddress)

    def unignoreNode(self, nodeAddress):
        self.ignoredNodes.remove(nodeAddress)

    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
        """
        Issues the following command returning a unique jobID. Command is the string to run, memory
        is an int giving the number of bytes the job needs to run in and cores is the number of cpus
        needed for the job and error-file is the path of the file to place any std-err/std-out in.
        """
        localID = self.handleLocalJob(jobNode)
        if localID:
            return localID

        mesos_resources = {
            "memory": jobNode.memory,
            "cores": jobNode.cores,
            "disk": jobNode.disk,
            "preemptable": jobNode.preemptable
        }
        self.checkResourceRequest(
            memory=mesos_resources["memory"],
            cores=mesos_resources["cores"],
            disk=mesos_resources["disk"]
        )

        jobID = self.getNextJobID()
        environment = self.environment.copy()
        if job_environment:
            environment.update(job_environment)

        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=MesosShape(wallTime=0, **mesos_resources),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=environment,
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))

        # TODO: round all elements of resources

        self.taskResources[jobID] = job.resources
        self.jobQueues.insertJob(job, jobType)
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):

        # Some jobs may be local. Kill them first.
        self.killLocalJobs(jobIDs)

        # The driver thread does the actual work of killing the remote jobs.
        # We have to give it instructions, and block until the jobs are killed.
        assert self.driver is not None

        # This is the set of jobs that this invocation has asked to be killed,
        # but which haven't been killed yet.
        localSet = set()

        for jobID in jobIDs:
            # Queue the job up to be killed
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            # Record that we meant to kill it, in case it finishes up by itself.
            self.intendedKill.add(jobID)

            if jobID in self.getIssuedBatchJobIDs():
                # Since the job has been issued, we have to kill it
                taskId = addict.Dict()
                taskId.value = str(jobID)
                log.debug("Kill issued job %s" % str(jobID))
                self.driver.killTask(taskId)
            else:
                # This job was never issued. Maybe it is a local job.
                # We don't have to kill it.
                log.debug("Skip non-issued job %s" % str(jobID))
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        # Now localSet just has the non-local/issued jobs that we asked to kill
        while localSet:
            # Wait until they are all dead
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                # When jobs are killed that we asked for, clear them out of
                # killedJobIds where the other thread put them
                self.killedJobIds -= intersection
            else:
                time.sleep(1)
        # Now all the jobs we asked to kill are dead. We know they are no
        # longer running, because that update happens before their IDs go into
        # killedJobIds. So we can safely return.

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds) + list(self.getIssuedLocalJobIDs())

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        currentTime.update(self.getRunningLocalJobIDs())
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        local_tuple = self.getUpdatedLocalJob(0)
        if local_tuple:
            return local_tuple
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            try:
                self.intendedKill.remove(item.jobID)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', item.jobID, item.exitStatus,
                          '???' if item.wallTime is None else str(item.wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', item.jobID)

    def nodeInUse(self, nodeIP: str) -> bool:
        return nodeIP in self.hostToJobIDs

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return 1

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo-shaped object representing our executor implementation.
        """
        # The executor program is installed as a setuptools entry point by setup.py
        info = addict.Dict()
        info.name = "toil"
        info.command.value = resolveEntryPoint('_toil_mesos_executor')
        info.executor_id.value = "toil-%i" % os.getpid()
        info.source = pwd.getpwuid(os.getuid()).pw_name
        return info

    def _startDriver(self):
        """
        The Mesos driver thread which handles the scheduler's communication with the Mesos master
        """
        framework = addict.Dict()
        framework.user = getpass.getuser()  # We must determine the user name ourselves with pymesos
        framework.name = "toil"
        framework.principal = framework.name
        # Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts.
        # Make sure it will call us with nice namespace-y addicts
        self.driver = MesosSchedulerDriver(self, framework,
                                           self._resolveAddress(self.mesos_endpoint),
                                           use_addict=True, implicit_acknowledgements=True)
        self.driver.start()

    @staticmethod
    def _resolveAddress(address):
        """
        Resolves the host in the given string. The input is of the form host[:port]. This method
        is idempotent, i.e. the host may already be a dotted IP address.

        >>> # noinspection PyProtectedMember
        >>> f=MesosBatchSystem._resolveAddress
        >>> f('localhost')
        '127.0.0.1'
        >>> f('127.0.0.1')
        '127.0.0.1'
        >>> f('localhost:123')
        '127.0.0.1:123'
        >>> f('127.0.0.1:123')
        '127.0.0.1:123'
        """
        address = address.split(':')
        assert len(address) in (1, 2)
        address[0] = socket.gethostbyname(address[0])
        return ':'.join(address)

    def shutdown(self) -> None:
        self.shutdownLocal()
        log.debug("Stopping Mesos driver")
        self.driver.stop()
        log.debug("Joining Mesos driver")
        driver_result = self.driver.join()
        log.debug("Joined Mesos driver")
        if driver_result is not None and driver_result != 'DRIVER_STOPPED':
            # TODO: The docs say join should return a code, but it keeps returning
            # None when apparently successful. So tolerate that here too.
            raise RuntimeError("Mesos driver failed with %s" % driver_result)

    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master
        """
        log.debug("Registered with framework ID %s", frameworkId.value)
        # Save the framework ID
        self.frameworkId = frameworkId.value

    def _declineAllOffers(self, driver, offers):
        for offer in offers:
            driver.declineOffer(offer.id)

    def _parseOffer(self, offer):
        cores = 0
        memory = 0
        disk = 0
        preemptable = None
        for attribute in offer.attributes:
            if attribute.name == 'preemptable':
                assert preemptable is None, "Attribute 'preemptable' occurs more than once."
                preemptable = strict_bool(attribute.text.value)
        if preemptable is None:
            log.debug('Agent not marked as either preemptable or not. Assuming non-preemptable.')
            preemptable = False
        for resource in offer.resources:
            if resource.name == "cpus":
                cores += resource.scalar.value
            elif resource.name == "mem":
                memory += resource.scalar.value
            elif resource.name == "disk":
                disk += resource.scalar.value
        return cores, memory, disk, preemptable

    def _prepareToRun(self, jobType, offer):
        # Get the first element to ensure FIFO
        job = self.jobQueues.nextJobOfType(jobType)
        task = self._newMesosTask(job, offer)
        return task

    def _updateStateToRunning(self, offer, runnableTasks):
        for task in runnableTasks:
            resourceKey = int(task.task_id.value)
            resources = self.taskResources[resourceKey]
            agentIP = socket.gethostbyname(offer.hostname)
            try:
                self.hostToJobIDs[agentIP].append(resourceKey)
            except KeyError:
                self.hostToJobIDs[agentIP] = [resourceKey]

            self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
                                                                   agentID=offer.agent_id.value,
                                                                   agentIP=agentIP,
                                                                   executorID=task.executor.executor_id.value,
                                                                   cores=resources.cores,
                                                                   memory=resources.memory)
            del self.taskResources[resourceKey]
            log.debug('Launched Mesos task %s.', task.task_id.value)

    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework.
        """
        self._trackOfferedNodes(offers)

        jobTypes = self.jobQueues.sortedTypes

        if not jobTypes:
            # Without jobs, we can get stuck with no jobs and no new offers until we decline it.
            self._declineAllOffers(driver, offers)
            return

        unableToRun = True
        # Right now, gives priority to largest jobs
        for offer in offers:
            if offer.hostname in self.ignoredNodes:
                driver.declineOffer(offer.id)
                continue
            runnableTasks = []
            # TODO: In an offer, can there ever be more than one resource with the same name?
            offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer)
            log.debug('Got offer %s for a %spreemptable agent with %.2f MiB memory, %.2f core(s) '
                      'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptable else 'non-',
                      offerMemory, offerCores, offerDisk)
            remainingCores = offerCores
            remainingMemory = offerMemory
            remainingDisk = offerDisk

            for jobType in jobTypes:
                runnableTasksOfType = []
                # Because we are not removing from the list until outside of the while loop, we
                # must decrement the number of jobs left to run ourselves to avoid an infinite
                # loop.
                nextToLaunchIndex = 0
                # Toil specifies disk and memory in bytes but Mesos uses MiB
                while ( not self.jobQueues.typeEmpty(jobType)
                       # On a non-preemptable node we can run any job, on a preemptable node we
                       # can only run preemptable jobs:
                       and (not offerPreemptable or jobType.preemptable)
                       and remainingCores >= jobType.cores
                       and remainingDisk >= b_to_mib(jobType.disk)
                       and remainingMemory >= b_to_mib(jobType.memory)):
                    task = self._prepareToRun(jobType, offer)
                    # TODO: this used to be a conditional but Hannes wanted it changed to an assert
                    # TODO: ... so we can understand why it exists.
                    assert int(task.task_id.value) not in self.runningJobMap
                    runnableTasksOfType.append(task)
                    log.debug("Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...",
                              task.task_id.value, jobType.cores, b_to_mib(jobType.memory), b_to_mib(jobType.disk), offer.id.value)
                    remainingCores -= jobType.cores
                    remainingMemory -= b_to_mib(jobType.memory)
                    remainingDisk -= b_to_mib(jobType.disk)
                    nextToLaunchIndex += 1
                if not self.jobQueues.typeEmpty(jobType):
                    # report that remaining jobs cannot be run with the current resourcesq:
                    log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
                              '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
                              'and %(disk)s of disk on a %(non)spreemptable agent.',
                              dict(offer=offer.id.value,
                                   requirements=jobType.__dict__,
                                   non='' if offerPreemptable else 'non-',
                                   memory=mib_to_b(offerMemory),
                                   cores=offerCores,
                                   disk=mib_to_b(offerDisk)))
                runnableTasks.extend(runnableTasksOfType)
            # Launch all runnable tasks together so we only call launchTasks once per offer
            if runnableTasks:
                unableToRun = False
                driver.launchTasks(offer.id, runnableTasks)
                self._updateStateToRunning(offer, runnableTasks)
            else:
                log.debug('Although there are queued jobs, none of them could be run with offer %s '
                          'extended to the framework.', offer.id)
                driver.declineOffer(offer.id)

        if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
            self.lastTimeOfferLogged = time.time()
            log.debug('Although there are queued jobs, none of them were able to run in '
                     'any of the offers extended to the framework. There are currently '
                     '%i jobs running. Enable debug level logging to see more details about '
                     'job types and offers received.', len(self.runningJobMap))

    def _trackOfferedNodes(self, offers):
        for offer in offers:
            # All AgentID messages are required to have a value according to the Mesos Protobuf file.
            assert 'value' in offer.agent_id
            try:
                nodeAddress = socket.gethostbyname(offer.hostname)
            except:
                log.debug("Failed to resolve hostname %s" % offer.hostname)
                raise
            self._registerNode(nodeAddress, offer.agent_id.value)
            preemptable = False
            for attribute in offer.attributes:
                if attribute.name == 'preemptable':
                    preemptable = strict_bool(attribute.text.value)
            if preemptable:
                try:
                    self.nonPreemptableNodes.remove(offer.agent_id.value)
                except KeyError:
                    pass
            else:
                self.nonPreemptableNodes.add(offer.agent_id.value)

    def _filterOfferedNodes(self, offers):
        if not self.nodeFilter:
            return offers
        executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers]
        executorInfos = [_f for _f in executorInfoOrNone if _f]
        executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
        ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
        return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider]

    def _newMesosTask(self, job, offer):
        """
        Build the Mesos task object for a given the Toil job and Mesos offer
        """
        task = addict.Dict()
        task.task_id.value = str(job.jobID)
        task.agent_id.value = offer.agent_id.value
        task.name = job.name
        task.data = encode_data(pickle.dumps(job))
        task.executor = addict.Dict(self.executor)

        task.resources = []

        task.resources.append(addict.Dict())
        cpus = task.resources[-1]
        cpus.name = 'cpus'
        cpus.type = 'SCALAR'
        cpus.scalar.value = job.resources.cores

        task.resources.append(addict.Dict())
        disk = task.resources[-1]
        disk.name = 'disk'
        disk.type = 'SCALAR'
        if b_to_mib(job.resources.disk) > 1:
            disk.scalar.value = b_to_mib(job.resources.disk)
        else:
            log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.disk)
            disk.scalar.value = 1

        task.resources.append(addict.Dict())
        mem = task.resources[-1]
        mem.name = 'mem'
        mem.type = 'SCALAR'
        if b_to_mib(job.resources.memory) > 1:
            mem.scalar.value = b_to_mib(job.resources.memory)
        else:
            log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.memory)
            mem.scalar.value = 1
        return task

    def statusUpdate(self, driver, update):
        """
        Invoked when the status of a task has changed (e.g., a agent is lost and so the task is
        lost, a task finishes and an executor sends a status update saying so, etc). Note that
        returning from this callback _acknowledges_ receipt of this status update! If for
        whatever reason the scheduler aborts during this callback (or the process exits) another
        status update will be delivered (note, however, that this is currently not true if the
        agent sending the status update is lost/fails during that time).
        """
        jobID = int(update.task_id.value)
        log.debug("Job %i is in state '%s' due to reason '%s'.", jobID, update.state, update.reason)

        def jobEnded(_exitStatus, wallTime=None, exitReason=None):
            """
            Notify external observers of the job ending.
            """
            self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=_exitStatus, wallTime=wallTime, exitReason=exitReason))
            agentIP = None
            try:
                agentIP = self.runningJobMap[jobID].agentIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                # Mark the job as no longer running. We MUST do this BEFORE
                # saying we killed the job, or it will be possible for another
                # thread to kill a job and then see it as running.
                del self.runningJobMap[jobID]

            try:
                self.hostToJobIDs[agentIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                # We were asked to kill this job, so say that we have done so.
                # We do this LAST, after all status updates for the job have
                # been handled, to ensure a consistent view of the scheduler
                # state from other threads.
                self.killedJobIds.add(jobID)

        if update.state == 'TASK_FINISHED':
            # We get the running time of the job via the timestamp, which is in job-local time in seconds
            labels = update.labels.labels
            wallTime = None
            for label in labels:
                if label['key'] == 'wallTime':
                    wallTime = float(label['value'])
                    break
            assert(wallTime is not None)
            jobEnded(0, wallTime=wallTime, exitReason=BatchJobExitReason.FINISHED)
        elif update.state == 'TASK_FAILED':
            try:
                exitStatus = int(update.message)
            except ValueError:
                exitStatus = EXIT_STATUS_UNAVAILABLE_VALUE
                log.warning("Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
                            jobID, update.message, update.reason,
                            update.executor_id, update.agent_id)
            else:
                log.warning("Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
                            jobID, exitStatus,
                            update.message, update.reason,
                            update.executor_id, update.agent_id)

            jobEnded(exitStatus, exitReason=BatchJobExitReason.FAILED)
        elif update.state == 'TASK_LOST':
            log.warning("Job %i is lost.", jobID)
            jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, exitReason=BatchJobExitReason.LOST)
        elif update.state in ('TASK_KILLED', 'TASK_ERROR'):
            log.warning("Job %i is in unexpected state %s with message '%s' due to reason '%s'.",
                        jobID, update.state, update.message, update.reason)
            jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE,
                     exitReason=(BatchJobExitReason.KILLED if update.state == 'TASK_KILLED' else BatchJobExitReason.ERROR))

        if 'limitation' in update:
            log.warning("Job limit info: %s" % update.limitation)

    def frameworkMessage(self, driver, executorId, agentId, message):
        """
        Invoked when an executor sends a message.
        """

        # Take it out of base 64 encoding from Protobuf
        message = decode_data(message).decode()

        log.debug('Got framework message from executor %s running on agent %s: %s',
                  executorId.value, agentId.value, message)
        message = ast.literal_eval(message)
        assert isinstance(message, dict)
        # Handle the mandatory fields of a message
        nodeAddress = message.pop('address')
        executor = self._registerNode(nodeAddress, agentId.value)
        # Handle optional message fields
        for k, v in message.items():
            if k == 'nodeInfo':
                assert isinstance(v, dict)
                resources = [taskData for taskData in self.runningJobMap.values()
                             if taskData.executorID == executorId.value]
                requestedCores = sum(taskData.cores for taskData in resources)
                requestedMemory = sum(taskData.memory for taskData in resources)
                executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v)
                self.executors[nodeAddress] = executor
            else:
                raise RuntimeError("Unknown message field '%s'." % k)

    def _registerNode(self, nodeAddress, agentId, nodePort=5051):
        """
        Called when we get communication from an agent. Remembers the
        information about the agent by address, and the agent address by agent
        ID.
        """
        executor = self.executors.get(nodeAddress)
        if executor is None or executor.agentId != agentId:
            executor = self.ExecutorInfo(nodeAddress=nodeAddress,
                                         agentId=agentId,
                                         nodeInfo=None,
                                         lastSeen=time.time())
            self.executors[nodeAddress] = executor
        else:
            executor.lastSeen = time.time()

        # Record the IP under the agent id
        self.agentsByID[agentId] = nodeAddress

        return executor

    def getNodes(self,
                 preemptable: Optional[bool] = None,
                 timeout: Optional[int] = None) -> Dict[str, NodeInfo]:
        """
        Return all nodes that match:
         - preemptable status (None includes all)
         - timeout period (seen within the last # seconds, or None for all)
        """
        nodes = dict()
        for node_ip, executor in self.executors.items():
            if preemptable is None or (preemptable == (executor.agentId not in self.nonPreemptableNodes)):
                if timeout is None or (time.time() - executor.lastSeen < timeout):
                    nodes[node_ip] = executor.nodeInfo
        return nodes

    def reregistered(self, driver, masterInfo):
        """
        Invoked when the scheduler re-registers with a newly elected Mesos master.
        """
        log.debug('Registered with new master')

    def _handleFailedExecutor(self, agentID, executorID=None):
        """
        Should be called when we find out an executor has failed.

        Gets the log from some container (since we are never handed a container
        ID) that ran on the given executor on the given agent, if the agent is
        still up, and dumps it to our log. All IDs are strings.

        If executorID is None, dumps all executors from the agent.

        Useful for debugging failing executor code.
        """

        log.warning("Handling failure of executor '%s' on agent '%s'.",
                    executorID, agentID)

        try:
            # Look up the IP. We should always know it unless we get answers
            # back without having accepted offers.
            agentAddress = self.agentsByID[agentID]

            # For now we assume the agent is always on the same port. We could
            # maybe sniff this from the URL that comes in the offer but it's
            # not guaranteed to be there.
            agentPort = 5051

            # We need the container ID to read the log, but we are never given
            # it, and I can't find a good way to list it, because the API only
            # seems to report running containers. So we dump all the available
            # files with /files/debug and look for one that looks right.
            filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % \
                (agentAddress, agentPort)

            # Download all the root mount points, which are in an object from
            # mounted name to real name
            filesDict = json.loads(urlopen(filesQueryURL).read())

            log.debug('Available files: %s', repr(filesDict.keys()))

            # Generate filenames for each container pointing to where stderr should be
            stderrFilenames = []
            # And look for the actual agent logs.
            agentLogFilenames = []
            for filename in filesDict:
                if (self.frameworkId in filename and agentID in filename and
                    (executorID is None or executorID in filename)):

                    stderrFilenames.append("%s/stderr" % filename)
                elif filename.endswith("log"):
                    agentLogFilenames.append(filename)

            if len(stderrFilenames) == 0:
                log.warning("Could not find any containers in '%s'." % filesDict)

            for stderrFilename in stderrFilenames:
                try:

                    # According to
                    # http://mesos.apache.org/documentation/latest/sandbox/ we can use
                    # the web API to fetch the error log.
                    errorLogURL = "http://%s:%d/files/download?path=%s" % \
                        (agentAddress, agentPort, quote_plus(stderrFilename))

                    log.warning("Attempting to retrieve executor error log: %s", errorLogURL)

                    for line in urlopen(errorLogURL):
                        # Warn all the lines of the executor's error log
                        log.warning("Executor: %s", line.rstrip())

                except Exception as e:
                    log.warning("Could not retrieve exceutor log due to: '%s'.", e)
                    log.warning(traceback.format_exc())

            for agentLogFilename in agentLogFilenames:
                try:
                    agentLogURL = "http://%s:%d/files/download?path=%s" % \
                        (agentAddress, agentPort, quote_plus(agentLogFilename))

                    log.warning("Attempting to retrieve agent log: %s", agentLogURL)

                    for line in urlopen(agentLogURL):
                        # Warn all the lines of the agent's log
                        log.warning("Agent: %s", line.rstrip())
                except Exception as e:
                    log.warning("Could not retrieve agent log due to: '%s'.", e)
                    log.warning(traceback.format_exc())

        except Exception as e:
            log.warning("Could not retrieve logs due to: '%s'.", e)
            log.warning(traceback.format_exc())

    def executorLost(self, driver, executorId, agentId, status):
        """
        Invoked when an executor has exited/terminated abnormally.
        """

        failedId = executorId.get('value', None)

        log.warning("Executor '%s' reported lost with status '%s'.", failedId, status)

        self._handleFailedExecutor(agentId.value, failedId)

    @classmethod
    def get_default_mesos_endpoint(cls) -> str:
        """
        Get the default IP/hostname and port that we will look for Mesos at.
        """
        return f'{get_public_ip()}:5050'

    @classmethod
    def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
        parser.add_argument("--mesosEndpoint", "--mesosMaster", dest="mesos_endpoint", default=cls.get_default_mesos_endpoint(),
                            help="The host and port of the Mesos master separated by colon.  (default: %(default)s)")

    @classmethod
    def setOptions(cls, setOption):
        setOption("mesos_endpoint", None, None, cls.get_default_mesos_endpoint(), old_names=["mesosMasterAddress"])
Example #6
0
class MesosBatchSystem(BatchSystemSupport,
                       AbstractScalableBatchSystem,
                       mesos.interface.Scheduler):
    """
    A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
    tasks over a cluster of slave nodes. A Mesos framework consists of a scheduler and an
    executor. This class acts as the scheduler and is typically run on the master node that also
    runs the Mesos master process with which the scheduler communicates via a driver component.
    The executor is implemented in a separate class. It is run on each slave node and
    communicates with the Mesos slave process via another driver object. The scheduler may also
    be run on a separate node from the master, which we then call somewhat ambiguously the driver
    node.
    """

    @classmethod
    def supportsHotDeployment(cls):
        return True

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    class ExecutorInfo(object):
        def __init__(self, nodeAddress, slaveId, nodeInfo, lastSeen):
            super(MesosBatchSystem.ExecutorInfo, self).__init__()
            self.nodeAddress = nodeAddress
            self.slaveId = slaveId
            self.nodeInfo = nodeInfo
            self.lastSeen = lastSeen

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)

        # The hot-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesosMasterAddress = config.mesosMasterAddress

        # Written to when Mesos kills tasks, as directed by Toil
        self.killedJobIds = set()

        # The IDs of job to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs were called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids
        # this is somewhat redundant since Mesos returns the number of workers per
        # node. However, that information isn't guaranteed to reach the leader,
        # so we also track the state here. When the information is returned from
        # mesos, prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # see self.setNodeFilter
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an
        #  approximation of the truth. Recently launched nodes may be absent from this set for a
        # while and a node's absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since non-preemptability is a stronger
        # requirement. If we tracked the set of preemptable nodes instead, we'd have to use
        # absence as an indicator of non-preemptability and could therefore be misled into
        # believeing that a recently launched preemptable node was non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        self.unusedJobID = itertools.count()
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def issueBatchJob(self, jobNode):
        """
        Issues the following command returning a unique jobID. Command is the string to run, memory
        is an int giving the number of bytes the job needs to run in and cores is the number of cpus
        needed for the job and error-file is the path of the file to place any std-err/std-out in.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)
        jobID = next(self.unusedJobID)
        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=ResourceRequirement(**jobNode._requirements),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=self.environment.copy(),
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))

        # TODO: round all elements of resources

        self.jobQueues.insertJob(job, jobType)
        self.taskResources[jobID] = job.resources
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):
        # FIXME: probably still racy
        assert self.driver is not None
        localSet = set()
        for jobID in jobIDs:
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            self.intendedKill.add(jobID)
            # FIXME: a bit too expensive for my taste
            if jobID in self.getIssuedBatchJobIDs():
                taskId = mesos_pb2.TaskID()
                taskId.value = str(jobID)
                self.driver.killTask(taskId)
            else:
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        while localSet:
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                self.killedJobIds -= intersection
            else:
                time.sleep(1)

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds)

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            jobId, exitValue, wallTime = item
            try:
                self.intendedKill.remove(jobId)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', jobId, exitValue,
                          '???' if wallTime is None else str(wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', jobId)

    def nodeInUse(self, nodeIP):
        return nodeIP in self.hostToJobIDs

    @contextmanager
    def nodeFiltering(self, filter):
        self.nodeFilter = [filter]
        yield
        self.nodeFilter = []

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return self.reconciliationPeriod

    @classmethod
    def getRescueBatchJobFrequency(cls):
        return 30 * 60  # Half an hour

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo instance representing our executor implementation.
        """
        # The executor program is installed as a setuptools entry point by setup.py
        info = mesos_pb2.ExecutorInfo()
        info.name = "toil"
        info.command.value = resolveEntryPoint('_toil_mesos_executor')
        info.executor_id.value = "toil-%i" % os.getpid()
        info.source = pwd.getpwuid(os.getuid()).pw_name
        return info

    def _startDriver(self):
        """
        The Mesos driver thread which handles the scheduler's communication with the Mesos master
        """
        framework = mesos_pb2.FrameworkInfo()
        framework.user = ""  # Have Mesos fill in the current user.
        framework.name = "toil"
        framework.principal = framework.name
        self.driver = mesos.native.MesosSchedulerDriver(self,
                                                        framework,
                                                        self._resolveAddress(self.mesosMasterAddress),
                                                        True)  # enable implicit acknowledgements
        assert self.driver.start() == mesos_pb2.DRIVER_RUNNING

    @staticmethod
    def _resolveAddress(address):
        """
        Resolves the host in the given string. The input is of the form host[:port]. This method
        is idempotent, i.e. the host may already be a dotted IP address.

        >>> # noinspection PyProtectedMember
        >>> f=MesosBatchSystem._resolveAddress
        >>> f('localhost')
        '127.0.0.1'
        >>> f('127.0.0.1')
        '127.0.0.1'
        >>> f('localhost:123')
        '127.0.0.1:123'
        >>> f('127.0.0.1:123')
        '127.0.0.1:123'
        """
        address = address.split(':')
        assert len(address) in (1, 2)
        address[0] = socket.gethostbyname(address[0])
        return ':'.join(address)

    def shutdown(self):
        log.debug("Stopping Mesos driver")
        self.driver.stop()
        log.debug("Joining Mesos driver")
        driver_result = self.driver.join()
        log.debug("Joined Mesos driver")
        if driver_result != mesos_pb2.DRIVER_STOPPED:
            raise RuntimeError("Mesos driver failed with %i", driver_result)

    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master
        """
        log.debug("Registered with framework ID %s", frameworkId.value)

    def _declineAllOffers(self, driver, offers):
        for offer in offers:
            log.debug("Declining offer %s.", offer.id.value)
            driver.declineOffer(offer.id)

    def _parseOffer(self, offer):
        cores = 0
        memory = 0
        disk = 0
        preemptable = None
        for attribute in offer.attributes:
            if attribute.name == 'preemptable':
                assert preemptable is None, "Attribute 'preemptable' occurs more than once."
                preemptable = strict_bool(attribute.text.value)
        if preemptable is None:
            log.debug('Slave not marked as either preemptable or not. Assuming non-preemptable.')
            preemptable = False
        for resource in offer.resources:
            if resource.name == "cpus":
                cores += resource.scalar.value
            elif resource.name == "mem":
                memory += resource.scalar.value
            elif resource.name == "disk":
                disk += resource.scalar.value
        return cores, memory, disk, preemptable

    def _prepareToRun(self, jobType, offer):
        # Get the first element to insure FIFO
        job = self.jobQueues.nextJobOfType(jobType)
        task = self._newMesosTask(job, offer)
        return task

    def _updateStateToRunning(self, offer, runnableTasks):
        for task in runnableTasks:
            resourceKey = int(task.task_id.value)
            resources = self.taskResources[resourceKey]
            slaveIP = socket.gethostbyname(offer.hostname)
            try:
                self.hostToJobIDs[slaveIP].append(resourceKey)
            except KeyError:
                self.hostToJobIDs[slaveIP] = [resourceKey]

            self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
                                                                   slaveID=offer.slave_id.value,
                                                                   slaveIP=slaveIP,
                                                                   executorID=task.executor.executor_id.value,
                                                                   cores=resources.cores,
                                                                   memory=resources.memory)
            del self.taskResources[resourceKey]
            log.debug('Launched Mesos task %s.', task.task_id.value)

    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework.
        """
        self._trackOfferedNodes(offers)

        jobTypes = self.jobQueues.sorted()

        # TODO: We may want to assert that numIssued >= numRunning
        if not jobTypes or len(self.getIssuedBatchJobIDs()) == len(self.getRunningBatchJobIDs()):
            log.debug('There are no queued tasks. Declining Mesos offers.')
            # Without jobs, we can get stuck with no jobs and no new offers until we decline it.
            self._declineAllOffers(driver, offers)
            return

        unableToRun = True
        # Right now, gives priority to largest jobs
        for offer in offers:
            runnableTasks = []
            # TODO: In an offer, can there ever be more than one resource with the same name?
            offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer)
            log.debug('Got offer %s for a %spreemptable slave with %.2f MiB memory, %.2f core(s) '
                      'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptable else 'non-',
                      offerMemory, offerCores, offerDisk)
            remainingCores = offerCores
            remainingMemory = offerMemory
            remainingDisk = offerDisk

            for jobType in jobTypes:
                runnableTasksOfType = []
                # Because we are not removing from the list until outside of the while loop, we
                # must decrement the number of jobs left to run ourselves to avoid an infinite
                # loop.
                nextToLaunchIndex = 0
                # Toil specifies disk and memory in bytes but Mesos uses MiB
                while ( not self.jobQueues.typeEmpty(jobType)
                       # On a non-preemptable node we can run any job, on a preemptable node we
                       # can only run preemptable jobs:
                       and (not offerPreemptable or jobType.preemptable)
                       and remainingCores >= jobType.cores
                       and remainingDisk >= toMiB(jobType.disk)
                       and remainingMemory >= toMiB(jobType.memory)):
                    task = self._prepareToRun(jobType, offer)
                    # TODO: this used to be a conditional but Hannes wanted it changed to an assert
                    # TODO: ... so we can understand why it exists.
                    assert int(task.task_id.value) not in self.runningJobMap
                    runnableTasksOfType.append(task)
                    log.debug("Preparing to launch Mesos task %s using offer %s ...",
                              task.task_id.value, offer.id.value)
                    remainingCores -= jobType.cores
                    remainingMemory -= toMiB(jobType.memory)
                    remainingDisk -= toMiB(jobType.disk)
                    nextToLaunchIndex += 1
                else:
                    log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
                              '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
                              'and %(disk)s of disk on a %(non)spreemptable slave.',
                              dict(offer=offer.id.value,
                                   requirements=jobType.__dict__,
                                   non='' if offerPreemptable else 'non-',
                                   memory=fromMiB(offerMemory),
                                   cores=offerCores,
                                   disk=fromMiB(offerDisk)))
                runnableTasks.extend(runnableTasksOfType)
            # Launch all runnable tasks together so we only call launchTasks once per offer
            if runnableTasks:
                unableToRun = False
                driver.launchTasks(offer.id, runnableTasks)
                self._updateStateToRunning(offer, runnableTasks)
            else:
                log.debug('Although there are queued jobs, none of them could be run with offer %s '
                          'extended to the framework.', offer.id)
                driver.declineOffer(offer.id)

        if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
            self.lastTimeOfferLogged = time.time()
            log.debug('Although there are queued jobs, none of them were able to run in '
                     'any of the offers extended to the framework. There are currently '
                     '%i jobs running. Enable debug level logging to see more details about '
                     'job types and offers received.', len(self.runningJobMap))

    def _trackOfferedNodes(self, offers):
        for offer in offers:
            nodeAddress = socket.gethostbyname(offer.hostname)
            self._registerNode(nodeAddress, offer.slave_id.value)
            preemptable = False
            for attribute in offer.attributes:
                if attribute.name == 'preemptable':
                    preemptable = strict_bool(attribute.text.value)
            if preemptable:
                try:
                    self.nonPreemptableNodes.remove(offer.slave_id.value)
                except KeyError:
                    pass
            else:
                self.nonPreemptableNodes.add(offer.slave_id.value)

    def _filterOfferedNodes(self, offers):
        if not self.nodeFilter:
            return offers
        executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers]
        executorInfos = [_f for _f in executorInfoOrNone if _f]
        executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
        ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
        return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider]

    def _newMesosTask(self, job, offer):
        """
        Build the Mesos task object for a given the Toil job and Mesos offer
        """
        task = mesos_pb2.TaskInfo()
        task.task_id.value = str(job.jobID)
        task.slave_id.value = offer.slave_id.value
        task.name = job.name
        task.data = pickle.dumps(job)
        task.executor.MergeFrom(self.executor)

        cpus = task.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = job.resources.cores

        disk = task.resources.add()
        disk.name = "disk"
        disk.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.disk) > 1:
            disk.scalar.value = toMiB(job.resources.disk)
        else:
            log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.disk)
            disk.scalar.value = 1
        mem = task.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.memory) > 1:
            mem.scalar.value = toMiB(job.resources.memory)
        else:
            log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.memory)
            mem.scalar.value = 1
        return task

    def statusUpdate(self, driver, update):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost and so the task is
        lost, a task finishes and an executor sends a status update saying so, etc). Note that
        returning from this callback _acknowledges_ receipt of this status update! If for
        whatever reason the scheduler aborts during this callback (or the process exits) another
        status update will be delivered (note, however, that this is currently not true if the
        slave sending the status update is lost/fails during that time).
        """
        jobID = int(update.task_id.value)
        stateName = mesos_pb2.TaskState.Name(update.state)
        log.debug("Job %i is in state '%s'.", jobID, stateName)

        def jobEnded(_exitStatus, wallTime=None):
            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                self.killedJobIds.add(jobID)
            self.updatedJobsQueue.put((jobID, _exitStatus, wallTime))
            slaveIP = None
            try:
                slaveIP = self.runningJobMap[jobID].slaveIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                del self.runningJobMap[jobID]

            try:
                self.hostToJobIDs[slaveIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

        if update.state == mesos_pb2.TASK_FINISHED:
            jobEnded(0, wallTime=unpack('d', update.data)[0])
        elif update.state == mesos_pb2.TASK_FAILED:
            try:
                exitStatus = int(update.message)
            except ValueError:
                exitStatus = 255
                log.warning("Job %i failed with message '%s'", jobID, update.message)
            else:
                log.warning('Job %i failed with exit status %i', jobID, exitStatus)
            jobEnded(exitStatus)
        elif update.state in (mesos_pb2.TASK_LOST, mesos_pb2.TASK_KILLED, mesos_pb2.TASK_ERROR):
            log.warning("Job %i is in unexpected state %s with message '%s'.",
                        jobID, stateName, update.message)
            jobEnded(255)

    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
        Invoked when an executor sends a message.
        """
        log.debug('Got framework message from executor %s running on slave %s: %s',
                  executorId.value, slaveId.value, message)
        message = ast.literal_eval(message)
        assert isinstance(message, dict)
        # Handle the mandatory fields of a message
        nodeAddress = message.pop('address')
        executor = self._registerNode(nodeAddress, slaveId.value)
        # Handle optional message fields
        for k, v in iteritems(message):
            if k == 'nodeInfo':
                assert isinstance(v, dict)
                resources = [taskData for taskData in itervalues(self.runningJobMap)
                             if taskData.executorID == executorId.value]
                requestedCores = sum(taskData.cores for taskData in resources)
                requestedMemory = sum(taskData.memory for taskData in resources)
                executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v)
                self.executors[nodeAddress] = executor
            else:
                raise RuntimeError("Unknown message field '%s'." % k)

    def _registerNode(self, nodeAddress, slaveId):
        executor = self.executors.get(nodeAddress)
        if executor is None or executor.slaveId != slaveId:
            executor = self.ExecutorInfo(nodeAddress=nodeAddress,
                                         slaveId=slaveId,
                                         nodeInfo=None,
                                         lastSeen=time.time())
            self.executors[nodeAddress] = executor
        else:
            executor.lastSeen = time.time()
        return executor

    def getNodes(self, preemptable=None, timeout=600):
        timeout = timeout or sys.maxsize
        return {nodeAddress: executor.nodeInfo
                for nodeAddress, executor in iteritems(self.executors)
                if time.time() - executor.lastSeen < timeout
                and (preemptable is None
                     or preemptable == (executor.slaveId not in self.nonPreemptableNodes))}

    def reregistered(self, driver, masterInfo):
        """
        Invoked when the scheduler re-registers with a newly elected Mesos master.
        """
        log.debug('Registered with new master')

    def executorLost(self, driver, executorId, slaveId, status):
        """
        Invoked when an executor has exited/terminated.
        """
        log.warning("Executor '%s' lost.", executorId)


    @classmethod
    def setOptions(cl, setOption):
        setOption("mesosMasterAddress", None, None, 'localhost:5050')