class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # Set up global shutdown
        global GLOBAL_SHUTDOWN
        GLOBAL_SHUTDOWN = self._shutdown

        logger.info('Scale Scheduler %s', settings.VERSION)

        self.run_scheduler(settings.MESOS_MASTER)

    def run_scheduler(self, mesos_master):
        """Creates the scheduler and Mesos client, runs the scheduler and exits the process.

        This method does not return: it always ends with :func:`sys.exit`.

        :param mesos_master: The Mesos master URL handed to the Mesos client
        :type mesos_master: string
        """

        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection
        self.client = MesosClient(
            # Honor the argument instead of re-reading settings.MESOS_MASTER
            mesos_urls=[mesos_master],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)

        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(json.loads(settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s', mesos_role)
        self.client.set_role(mesos_role)

        logger.info('Accepting offers from role: %s', settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except (Exception, KeyboardInterrupt):
            # SystemExit is deliberately not caught here: it is raised by _onsigterm after that
            # handler has already shut the scheduler down, so it must not be reported as a driver
            # failure or trigger a second shutdown.
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """

        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """

        status = 0

        try:
            # The scheduler attribute only exists once run_scheduler() has created it; a signal
            # arriving earlier must not be reported as a failed shutdown.
            scheduler = getattr(self, 'scheduler', None)
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        return status
class ESPAFramework(object):
    """Mesos framework that launches one Mesos task per queued ESPA work item.

    Work items are pulled from ``worklist`` when acceptable offers arrive, and
    task state changes are reported back through ``espa_api``.
    """

    def __init__(self, cfg, espa_api, worklist):
        """Store the configuration, build the Mesos client and seed the work queue.

        :param cfg: mapping read with ``.get()`` for the ``mesos_*``, ``max_cpu``,
            ``task_*``, ``offer_refuse_seconds``, ``product_request_count`` and
            ``product_frequency`` keys
        :param espa_api: ESPA API object used for status updates and error reports
        :param worklist: queue of work items; read with ``get(False)``
        """
        master = cfg.get('mesos_master')
        principal = cfg.get('mesos_principal')
        secret = cfg.get('mesos_secret')

        self.workList = worklist
        # Maps task_id -> time the task was first reported as TASK_RUNNING
        self.runningList = {}
        self.max_cpus = cfg.get('max_cpu')
        self.required_cpus = cfg.get('task_cpu')
        self.required_memory = cfg.get('task_mem')
        self.required_disk = cfg.get('task_disk')
        self.task_image = cfg.get('task_image')
        self.refuse_seconds = cfg.get('offer_refuse_seconds')
        self.request_count = cfg.get('product_request_count')
        self.products = cfg.get('product_frequency')
        # Any state outside this list is treated as a task failure
        self.healthy_states = [
            "TASK_STAGING", "TASK_STARTING", "TASK_RUNNING", "TASK_FINISHED"
        ]
        self.espa = espa_api
        self.cfg = cfg

        self.client = MesosClient(mesos_urls=[master],
                                  frameworkName='ESPA Mesos Framework')
        # NOTE(review): presumably disables TLS certificate verification - confirm
        self.client.verify = False
        self.client.set_credentials(principal, secret)
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)

        # put some work on the queue
        get_products_to_process(cfg, self.espa, self.workList)

    def _getResource(self, res, name):
        """Return the scalar value of the first resource called ``name``, or 0.0 if absent."""
        for r in res:
            if r['name'] == name:
                return r['scalar']['value']
        return 0.0

    def _updateResource(self, res, name, value):
        """Subtract ``value`` in place from the first resource called ``name``.

        Does nothing when ``value`` is zero or negative.
        """
        if value <= 0:
            return
        for r in res:
            if r['name'] == name:
                r['scalar']['value'] -= value
                return

    def subscribed(self, driver):
        """Handle the SUBSCRIBED event by remembering the driver."""
        log.warning('SUBSCRIBED')
        self.driver = driver

    def core_limit_reached(self):
        """Return True when running tasks already use ``max_cpus`` cores or more.

        Utilization is estimated as (number of tasks in ``runningList``) times
        the per-task CPU requirement.
        """
        core_utilization = len(self.runningList) * self.required_cpus
        log.debug("Number of cores being used: {}".format(core_utilization))
        if core_utilization >= self.max_cpus:
            log.debug("Max number of cores being used. Max = {}".format(
                self.max_cpus))
            return True
        return False

    def accept_offer(self, offer):
        """Decide whether ``offer`` has enough cpus/mem/disk for one task.

        When the offer is acceptable, the required amounts are subtracted from
        the offer's resource entries in place.

        :param offer: offer mapping with an optional ``resources`` list
        :returns: True if every non-zero requirement is satisfied, else False
        """
        # An offer without a resource list cannot satisfy anything; treat it as
        # empty instead of failing while iterating None.
        resources = offer.get('resources') or []
        requirements = (
            ("cpus", self.required_cpus),
            ("mem", self.required_memory),
            ("disk", self.required_disk),
        )

        for name, required in requirements:
            if required != 0 and required > self._getResource(resources, name):
                return False

        for name, required in requirements:
            self._updateResource(resources, name, required)
        return True

    def decline_offer(self, offer):
        """Decline ``offer`` with the configured ``refuse_seconds`` filter.

        :returns: True on success
        :raises Exception: whatever ``offer.decline`` raised, after logging it
        """
        options = {'filters': {'refuse_seconds': self.refuse_seconds}}
        log.debug("declining offer: {} with options: {}".format(
            offer, options))
        try:
            offer.decline(options)
        except Exception as error:
            log.error(
                "Exception encountered declining offer: {}, error: {}".format(
                    offer, error))
            raise
        return True

    def _decline_all(self, offers):
        """Decline every offer in ``offers``."""
        for offer in offers:
            self.decline_offer(offer)

    def offer_received(self, offers):
        """Handle the OFFERS event: launch a task per acceptable offer, decline the rest.

        :param offers: offer objects providing ``get_offer()``, ``accept()`` and ``decline()``
        :returns: ``addict.Dict`` with ``offers.length``, ``offers.accepted`` and ``tasks.enabled``
        """
        response = addict.Dict()
        response.offers.length = len(offers)
        response.offers.accepted = 0
        log.debug("Received {} new offers...".format(response.offers.length))

        # check to see if Mesos tasks are enabled
        if self.espa.mesos_tasks_disabled():
            # decline the offers to free up the resources
            log.debug("mesos tasks disabled, declining {} offers".format(
                len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response

        # check to see if core limit has been reached
        if self.core_limit_reached():
            # decline the offers to free up the resources
            log.debug(
                "Core utilization limit reached, declining {} offers".format(
                    len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response

        response.tasks.enabled = True

        for offer in offers:
            mesos_offer = offer.get_offer()
            if not self.accept_offer(mesos_offer):
                log.debug("Unacceptable offer, declining")
                self.decline_offer(offer)
                continue

            log.debug("Acceptable offer, checking for work to do")
            # Tracks whether offer.accept() already succeeded, so a later
            # failure does not decline an offer that is already in use.
            accepted = False
            try:
                # will raise multiprocessing.Empty if no objects present
                work = self.workList.get(False)
                orderid = work.get('orderid')
                scene = work.get('scene')
                task_id = "{}_@@@_{}".format(orderid, scene)
                new_task = task.build(task_id, mesos_offer, self.task_image,
                                      self.required_cpus,
                                      self.required_memory,
                                      self.required_disk, work, self.cfg)
                log.debug("New Task definition: {}".format(new_task))
                offer.accept([new_task])
                accepted = True
                self.espa.update_status(scene, orderid, 'tasked')
                response.offers.accepted += 1
            except Empty:
                log.debug("Work queue is empty, declining offer")
                self.decline_offer(offer)
            except Exception as e:
                if accepted:
                    log.error(
                        "Exception after accepting offer: {}, exception: {}"
                        .format(offer, e))
                else:
                    log.error(
                        "Exception creating new task. offer: {}, exception: {}\n declining offer"
                        .format(offer, e))
                    self.decline_offer(offer)

        log.debug("resourceOffer response: {}".format(response))
        return response

    def status_update(self, update):
        """Handle the UPDATE event: track running tasks and report failures.

        :param update: mapping with ``status.task_id.value`` and ``status.state``
        :returns: ``addict.Dict`` with ``task_id``, ``state``, ``status`` and,
            for TASK_RUNNING, ``list.name`` / ``list.status``
        :raises ValueError: if the task id is not of the form ``<orderid>_@@@_<scene>``
        """
        # possible state values
        # http://mesos.apache.org/api/latest/java/org/apache/mesos/Protos.TaskState.html
        task_id = update['status']['task_id']['value']
        orderid, scene = task_id.split("_@@@_")
        state = update['status']['state']

        response = addict.Dict()
        response.task_id = task_id
        response.state = state

        if state in self.healthy_states:
            log.debug("status update for: {} new status: {}".format(
                task_id, state))
            response.status = "healthy"

            if state == "TASK_RUNNING":
                response.list.name = "running"
                if task_id not in self.runningList:
                    self.runningList[task_id] = util.right_now()
                    response.list.status = "new"
                else:
                    response.list.status = "current"
            elif state == "TASK_FINISHED":
                try:
                    del self.runningList[task_id]
                except KeyError:
                    log.debug(
                        "Received TASK_FINISHED update for {}, which wasn't in the runningList"
                        .format(task_id))
        else:
            # something abnormal happened
            log.error("abnormal task state for: {}, full update: {}".format(
                task_id, update))
            response.status = "unhealthy"
            self.espa.set_scene_error(scene, orderid, update)
            # The task is no longer running, so stop counting its cores
            self.runningList.pop(task_id, None)

        return response