Example #1
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler

    ``handle`` installs a SIGTERM handler, publishes ``_shutdown`` through the
    module-level ``GLOBAL_SHUTDOWN`` name and then calls ``run_scheduler``,
    which always ends the process through ``sys.exit``.
    """

    help = 'Launches the Scale scheduler'

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # Set up global shutdown
        global GLOBAL_SHUTDOWN
        GLOBAL_SHUTDOWN = self._shutdown

        logger.info('Scale Scheduler %s', settings.VERSION)

        self.run_scheduler(settings.MESOS_MASTER)

    def run_scheduler(self, mesos_master):
        """Creates the scheduler and the Mesos client, runs the scheduler and exits the process.

        This method does not return: it finishes with ``sys.exit`` using a non-zero status when either running the
        scheduler or shutting it down failed.

        :param mesos_master: The Mesos master URL handed to the client
        """
        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection
        self.client = MesosClient(
            # Use the argument that was just logged rather than re-reading settings, so the parameter is honoured
            mesos_urls=[mesos_master],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)
        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(json.loads(
                settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s', mesos_role)
        self.client.set_role(mesos_role)

        logger.info('Accepting offers from role: %s',
                    settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except BaseException:
            # Deliberately broad: whatever interrupts the run (including KeyboardInterrupt or the SystemExit raised
            # by the SIGTERM handler) must still fall through to the shutdown below.
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        # The scheduler attribute only exists once run_scheduler() has created it; a signal that arrives earlier
        # simply has nothing to shut down and must not be reported as a failed shutdown.
        scheduler = getattr(self, 'scheduler', None)

        try:
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        return status
Example #2
0
class ESPAFramework(object):
    """Mesos framework that hands queued ESPA work items to resource offers.

    The constructor registers three callbacks on a ``MesosClient``
    (``subscribed``, ``offer_received`` and ``status_update``).  Work items
    are pulled from ``workList``; tasks reported as ``TASK_RUNNING`` are
    tracked in ``runningList`` (task id -> ``util.right_now()`` at the time
    the task was first seen running).
    """

    # Separator between the order id and the scene inside a task id.  Used
    # both to build task ids and to split them again in status updates.
    _TASK_ID_SEPARATOR = "_@@@_"

    def __init__(self, cfg, espa_api, worklist):
        """Read the configuration, build the Mesos client and queue work.

        :param cfg: configuration object read through ``cfg.get(key)``
        :param espa_api: ESPA API object; this class calls its
            ``mesos_tasks_disabled``, ``update_status`` and
            ``set_scene_error`` methods
        :param worklist: queue of work items; read with a non-blocking
            ``get(False)``
        """
        master = cfg.get('mesos_master')
        principal = cfg.get('mesos_principal')
        secret = cfg.get('mesos_secret')

        self.workList = worklist
        self.runningList = {}
        self.max_cpus = cfg.get('max_cpu')
        self.required_cpus = cfg.get('task_cpu')
        self.required_memory = cfg.get('task_mem')
        self.required_disk = cfg.get('task_disk')
        self.task_image = cfg.get('task_image')
        self.refuse_seconds = cfg.get('offer_refuse_seconds')
        self.request_count = cfg.get('product_request_count')
        self.products = cfg.get('product_frequency')
        self.healthy_states = [
            "TASK_STAGING", "TASK_STARTING", "TASK_RUNNING", "TASK_FINISHED"
        ]
        self.espa = espa_api
        self.cfg = cfg
        # Set by subscribed(); defined here so the attribute always exists.
        self.driver = None

        self.client = MesosClient(mesos_urls=[master],
                                  frameworkName='ESPA Mesos Framework')
        self.client.verify = False
        self.client.set_credentials(principal, secret)
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)

        # put some work on the queue
        get_products_to_process(cfg, self.espa, self.workList)

    def _getResource(self, res, name):
        """Return the scalar value of the first resource called ``name``.

        :param res: iterable of resource dicts shaped like
            ``{'name': ..., 'scalar': {'value': ...}}``
        :param name: resource name to look up
        :returns: the matching value, or ``0.0`` when no resource matches
        """
        for resource in res:
            if resource['name'] == name:
                return resource['scalar']['value']
        return 0.0

    def _updateResource(self, res, name, value):
        """Subtract ``value`` from every resource called ``name``, in place.

        Non-positive values are ignored, so nothing is ever added back.
        """
        if value <= 0:
            return
        for resource in res:
            if resource['name'] == name:
                resource['scalar']['value'] -= value

    def subscribed(self, driver):
        """Callback for the SUBSCRIBED event; remembers the driver."""
        log.warning('SUBSCRIBED')
        self.driver = driver

    def core_limit_reached(self):
        """Tell whether the tracked running tasks use up the core budget.

        Utilization is the number of entries in ``runningList`` multiplied
        by ``required_cpus``.

        :returns: ``True`` when utilization is at or above ``max_cpus``
        """
        core_utilization = len(self.runningList) * self.required_cpus

        log.debug("Number of cores being used: {}".format(core_utilization))
        if core_utilization >= self.max_cpus:
            log.debug("Max number of cores being used. Max = {}".format(
                self.max_cpus))
            return True

        return False

    def accept_offer(self, offer):
        """Decide whether ``offer`` can hold one task and reserve its share.

        When the offer is large enough, the required cpus, memory and disk
        are subtracted from the offer's resource entries in place.

        :param offer: mapping with a ``'resources'`` list; a missing or
            empty list is treated as an offer without resources
        :returns: ``True`` if the offer satisfies every non-zero requirement
        """
        # A missing 'resources' entry used to blow up while iterating None;
        # treat it as "nothing offered" instead.
        resources = offer.get('resources') or []
        requirements = (
            ("cpus", self.required_cpus),
            ("mem", self.required_memory),
            ("disk", self.required_disk),
        )

        for name, required in requirements:
            # A requirement of 0 means "don't care" and is never checked.
            if required != 0 and required > self._getResource(resources, name):
                return False

        for name, required in requirements:
            self._updateResource(resources, name, required)
        return True

    def decline_offer(self, offer):
        """Decline ``offer`` with the configured ``refuse_seconds`` filter.

        :returns: ``True`` when the decline call did not raise
        :raises Exception: whatever ``offer.decline`` raised, after logging
        """
        options = {'filters': {'refuse_seconds': self.refuse_seconds}}
        log.debug("declining offer: {} with options: {}".format(
            offer, options))
        try:
            offer.decline(options)
        except Exception as error:
            log.error(
                "Exception encountered declining offer: {}, error: {}".format(
                    offer, error))
            raise
        return True

    def _decline_all(self, offers):
        """Decline every offer in ``offers`` to free up the resources."""
        for offer in offers:
            self.decline_offer(offer)

    def offer_received(self, offers):
        """Callback for the OFFERS event; launch one task per usable offer.

        All offers are declined when ESPA reports Mesos tasks as disabled or
        when the core limit is reached.  Otherwise each acceptable offer is
        paired with one work item from the queue; offers that are too small,
        arrive while the queue is empty, or fail during task creation are
        declined.

        :param offers: sequence of offer objects exposing ``get_offer()``,
            ``accept(tasks)`` and ``decline(options)``
        :returns: ``addict.Dict`` with ``offers.length``, ``offers.accepted``
            and ``tasks.enabled``
        """
        response = addict.Dict()
        response.offers.length = len(offers)
        response.offers.accepted = 0
        log.debug("Received {} new offers...".format(response.offers.length))

        # check to see if Mesos tasks are enabled
        if self.espa.mesos_tasks_disabled():
            log.debug("mesos tasks disabled, declining {} offers".format(
                len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response

        # check to see if core limit has been reached
        if self.core_limit_reached():
            log.debug(
                "Core utilization limit reached, declining {} offers".format(
                    len(offers)))
            self._decline_all(offers)
            # NOTE(review): reported as "not enabled" although tasks are
            # enabled and only the core limit was hit; kept as-is because
            # callers may rely on it -- confirm the intent.
            response.tasks.enabled = False
            return response

        response.tasks.enabled = True

        for offer in offers:
            mesos_offer = offer.get_offer()
            if not self.accept_offer(mesos_offer):
                log.debug("Unacceptable offer, declining")
                self.decline_offer(offer)
                continue

            log.debug("Acceptable offer, checking for work to do")
            try:
                # Non-blocking get: raises Empty if no objects are present.
                work = self.workList.get(False)
                orderid = work.get('orderid')
                scene = work.get('scene')
                task_id = "{}{}{}".format(orderid, self._TASK_ID_SEPARATOR,
                                          scene)
                new_task = task.build(task_id, mesos_offer,
                                      self.task_image, self.required_cpus,
                                      self.required_memory,
                                      self.required_disk, work, self.cfg)
                log.debug("New Task definition: {}".format(new_task))
                offer.accept([new_task])
                self.espa.update_status(scene, orderid, 'tasked')
                response.offers.accepted += 1
            except Empty:
                log.debug("Work queue is empty, declining offer")
                self.decline_offer(offer)
            except Exception as e:
                log.error(
                    "Exception creating new task. offer: {}, exception: {}\n declining offer"
                    .format(offer, e))
                self.decline_offer(offer)

        log.debug("resourceOffer response: {}".format(response))
        return response

    def status_update(self, update):
        """Callback for the UPDATE event; keep ``runningList`` in sync.

        Healthy states only adjust the running list.  Any other state is
        logged, reported to ESPA through ``set_scene_error`` and the task is
        dropped from the running list.

        :param update: mapping with ``update['status']['task_id']['value']``
            and ``update['status']['state']``
        :returns: ``addict.Dict`` with ``task_id``, ``state``, ``status`` and,
            for ``TASK_RUNNING``, ``list.name`` / ``list.status``
        :raises ValueError: if the task id does not split into exactly an
            order id and a scene
        """
        # possible state values
        # http://mesos.apache.org/api/latest/java/org/apache/mesos/Protos.TaskState.html
        task_id = update['status']['task_id']['value']
        orderid, scene = task_id.split(self._TASK_ID_SEPARATOR)
        state = update['status']['state']

        response = addict.Dict()
        response.task_id = task_id
        response.state = state

        if state not in self.healthy_states:
            # something abnormal happened
            log.error("abnormal task state for: {}, full update: {}".format(
                task_id, update))
            response.status = "unhealthy"
            self.espa.set_scene_error(scene, orderid, update)
            self.runningList.pop(task_id, None)
            return response

        log.debug("status update for: {}  new status: {}".format(
            task_id, state))
        response.status = "healthy"

        if state == "TASK_RUNNING":
            response.list.name = "running"
            if task_id not in self.runningList:
                self.runningList[task_id] = util.right_now()
                response.list.status = "new"
            else:
                response.list.status = "current"
        elif state == "TASK_FINISHED":
            try:
                del self.runningList[task_id]
            except KeyError:
                log.debug(
                    "Received TASK_FINISHED update for {}, which wasn't in the runningList"
                    .format(task_id))

        return response