def __init__(self, cfg, espa_api, worklist):
    """Store scheduling settings and wire up the Mesos client.

    :param cfg: configuration object exposing ``get(key)``; the keys read
        here are ``mesos_master``, ``mesos_principal``, ``mesos_secret``,
        ``max_cpu``, ``task_cpu``, ``task_mem``, ``task_disk``,
        ``task_image``, ``offer_refuse_seconds``,
        ``product_request_count`` and ``product_frequency``.
    :param espa_api: ESPA API handle, kept as ``self.espa``.
    :param worklist: queue of pending work, kept as ``self.workList``.
    """
    self.workList = worklist
    self.runningList = {}
    self.espa = espa_api
    self.cfg = cfg

    # Copy the scheduling knobs from the config onto the instance.
    for attribute, key in (
            ('max_cpus', 'max_cpu'),
            ('required_cpus', 'task_cpu'),
            ('required_memory', 'task_mem'),
            ('required_disk', 'task_disk'),
            ('task_image', 'task_image'),
            ('refuse_seconds', 'offer_refuse_seconds'),
            ('request_count', 'product_request_count'),
            ('products', 'product_frequency'),
    ):
        setattr(self, attribute, cfg.get(key))

    # Task states that are not treated as errors by status_update.
    self.healthy_states = [
        "TASK_STAGING",
        "TASK_STARTING",
        "TASK_RUNNING",
        "TASK_FINISHED",
    ]

    self.client = MesosClient(mesos_urls=[cfg.get('mesos_master')],
                              frameworkName='ESPA Mesos Framework')
    # NOTE(review): presumably disables TLS certificate verification for
    # the client -- confirm against the mesoshttp documentation.
    self.client.verify = False
    self.client.set_credentials(cfg.get('mesos_principal'),
                                cfg.get('mesos_secret'))
    for event, callback in (
            (MesosClient.SUBSCRIBED, self.subscribed),
            (MesosClient.OFFERS, self.offer_received),
            (MesosClient.UPDATE, self.status_update),
    ):
        self.client.on(event, callback)

    # Put some work on the queue before the first offers arrive.
    get_products_to_process(cfg, self.espa, self.workList)
def __init__(self, url="http://127.0.0.1:5050"):
    """Create the framework, start it and block until it stops.

    The constructor does not return until the background registration
    thread ends or the user presses Ctrl-C.

    :param url: URL of the Mesos master to register with.
    """
    logging.basicConfig()
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.DEBUG)

    # Task bookkeeping counters.
    self.launched_tasks = 0
    self.running_tasks = 0
    self.finished_tasks = 0
    self.killed_tasks = 0
    self.lost_tasks = []

    logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

    # Set by the SUBSCRIBED callback once registration succeeds.
    self.driver = None
    self.client = MesosClient(mesos_urls=[url],
                              frameworkId=None,
                              frameworkName='Python HTTP framework',
                              frameworkUser=getpass.getuser())
    self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
    self.client.on(MesosClient.OFFERS, self.offer_received)
    self.client.on(MesosClient.UPDATE, self.status_update)

    self.th = Test.MesosFramework(self.client)
    self.th.start()
    # Thread.isAlive() was removed in Python 3.9; is_alive() is available
    # on both Python 2.6+ and Python 3.
    while self.th.is_alive():
        try:
            # Join with a timeout so Ctrl-C can interrupt the wait.
            self.th.join(1)
        except KeyboardInterrupt:
            self.shutdown()
            break
def __init__(self):
    """Create the framework from environment settings and run it.

    Environment variables read:

    * ``MESOS_URLS`` -- comma-delimited master URLs, default
      ``http://127.0.0.1:5050``.
    * ``SERVICE_SECRET`` -- optional JSON document passed to
      ``MesosClient.set_service_account``.

    The constructor does not return until the background registration
    thread ends or the user presses Ctrl-C.
    """
    logging.basicConfig()
    self.logger = logging.getLogger(__name__)
    logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

    # Set by the SUBSCRIBED callback once registration succeeds.
    self.driver = None

    # Addressing examples:
    #   Zookeeper discovery via Mesos DNS: 'zk://leader.mesos:2181/mesos'
    #   Zookeeper discovery, explicit:     'zk://127.0.0.1:2181/mesos'
    #   Direct master address:             'http://127.0.0.1:5050'
    # By default the master is addressed directly. Several comma-delimited
    # URLs may be supplied through the MESOS_URLS environment variable.
    master_urls = os.getenv('MESOS_URLS', 'http://127.0.0.1:5050')
    self.client = MesosClient(mesos_urls=master_urls.split(','))

    secret = os.getenv('SERVICE_SECRET')
    if secret:
        self.client.set_service_account(json.loads(secret))

    self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
    self.client.on(MesosClient.OFFERS, self.offer_received)
    self.client.on(MesosClient.UPDATE, self.status_update)

    self.th = Test.MesosFramework(self.client)
    self.th.start()
    # Thread.isAlive() was removed in Python 3.9; is_alive() is available
    # on both Python 2.6+ and Python 3.
    while self.th.is_alive():
        try:
            # Join with a timeout so Ctrl-C can interrupt the wait.
            self.th.join(1)
        except KeyboardInterrupt:
            self.shutdown()
            break
def __init__(self, url="http://127.0.0.1:5050", name="HTTP framework", user="******"):
    """Prepare a Mesos client and its runner thread without starting it.

    :param url: URL of the Mesos master.
    :param name: framework name passed to ``MesosClient``.
    :param user: framework user passed to ``MesosClient``.
    """
    logging.basicConfig()
    self.logger = logging.getLogger(__name__)
    logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

    self.tasks = 0
    # Filled in by the SUBSCRIBED callback.
    self.driver = None
    # Only a single offer and a single update are retained at a time.
    self.mesos_offer = None
    self.update = None
    self.task_id = None
    self.agent_id = None

    mesos_client = MesosClient(mesos_urls=[url],
                               frameworkName=name,
                               frameworkUser=user)
    self.client = mesos_client
    handlers = (
        (MesosClient.SUBSCRIBED, self.subscribed),
        (MesosClient.OFFERS, self.offer_received),
        (MesosClient.UPDATE, self.status_update),
    )
    for event, callback in handlers:
        mesos_client.on(event, callback)

    self.th = Test.MesosFramework(mesos_client)
def __init__(self):
    """Create the framework, register with the master and block.

    The master address is hard-coded. The constructor does not return
    until the background registration thread ends or the user presses
    Ctrl-C.
    """
    logging.basicConfig()
    self.logger = logging.getLogger(__name__)
    logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

    # Set by the SUBSCRIBED callback once registration succeeds.
    self.driver = None
    self.client = MesosClient(mesos_urls=['http://52.87.159.219:5050'])
    self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
    self.client.on(MesosClient.OFFERS, self.offer_received)
    self.client.on(MesosClient.UPDATE, self.status_update)

    self.th = Test.MesosFramework(self.client)
    self.th.start()
    # Thread.isAlive() was removed in Python 3.9; is_alive() is available
    # on both Python 2.6+ and Python 3.
    while self.th.is_alive():
        try:
            # Join with a timeout so Ctrl-C can interrupt the wait.
            self.th.join(1)
        except KeyboardInterrupt:
            self.shutdown()
            break
def run_scheduler(self, mesos_master):
    """Initialize the scheduler, connect to Mesos and run until exit.

    This method never returns normally: it always ends in ``sys.exit``.

    :param mesos_master: address of the Mesos master to connect to.
    """
    logger.info("Scale rising...")
    self.scheduler = ScaleScheduler()
    self.scheduler.initialize()
    scheduler_mgr.hostname = socket.getfqdn()

    logger.info('Connecting to Mesos master at %s:', mesos_master)

    # By default use ZK for master detection.
    # The address passed by the caller is used (previously the argument
    # was only logged and the setting was read again).
    self.client = MesosClient(
        mesos_urls=[mesos_master],
        # We have to run tasks as root, so docker commands may be executed
        frameworkUser='******',
        frameworkName=settings.FRAMEWORK_NAME,
        frameworkHostname=scheduler_mgr.hostname,
        frameworkWebUI=settings.WEBSERVER_ADDRESS)

    if settings.SERVICE_SECRET:
        # We are in Enterprise mode and using service account
        self.client.set_service_account(json.loads(settings.SERVICE_SECRET))
    elif settings.PRINCIPAL and settings.SECRET:
        self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

    mesos_role = settings.MESOS_ROLE
    logger.info('Launching scheduler with role: %s', mesos_role)
    self.client.set_role(mesos_role)
    logger.info('Accepting offers from role: %s',
                settings.ACCEPTED_RESOURCE_ROLE)

    self.client.add_capability('GPU_RESOURCES')

    try:
        self.scheduler.run(self.client)
        status = 0
    except BaseException:
        # Deliberately as broad as the original bare ``except``: Ctrl-C
        # and SystemExit must still reach the shutdown below.
        status = 1
        logger.exception('Mesos Scheduler Driver returned an exception')

    # Perform a shut down and return any non-zero status
    shutdown_status = self._shutdown()
    status = status or shutdown_status

    logger.info('Exiting...')
    sys.exit(status)
def start(self):
    """Register with the Mesos master and block until the thread stops.

    ``self.running`` is True while the registration thread is alive and
    is reset to False once it has ended.
    """
    self.fprint("Start, connecting to:" + MESOS_MASTER)
    # Set by the SUBSCRIBED callback once registration succeeds.
    self.driver = None
    self.client = MesosClient(mesos_urls=[MESOS_MASTER])
    self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
    self.client.on(MesosClient.OFFERS, self.offer_received)
    self.client.on(MesosClient.UPDATE, self.status_update)
    self.client.frameworkName = "HPC Framework"

    self.th = HpcFramework.MesosFramework(self.client)
    self.th.start()
    # Thread.isAlive() was removed in Python 3.9; is_alive() is available
    # on both Python 2.6+ and Python 3.
    while self.th.is_alive():
        try:
            self.running = True
            # Join with a timeout so the loop stays responsive.
            self.th.join(1)
        except Exception as e:
            self.fprint("mesos framework exception" + str(e))
    self.running = False
    self.fprint("mesos framework stopped")
def __init__(self, script_path, setup_path, headnode, ssl_thumbprint, client_cert, node_group=""):
    """Set up HPC Pack cluster tracking and the Mesos client.

    :param script_path: path of the PowerShell setup script run by tasks.
    :param setup_path: value passed to the script as ``-setupPath``.
    :param headnode: head node name(s); commas are replaced by
        underscores to build the framework name.
    :param ssl_thumbprint: value passed to the script as
        ``-sslthumbprint``.
    :param client_cert: certificate handed to ``HpcRestClient``.
    :param node_group: optional node group; an empty string means no
        group restriction.
    """
    logging.basicConfig()
    self.logger = logging_aux.init_logger_aux("hpcframework", "hpcframework.log")
    logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

    self.node_idle_check_table = {}
    self.script_path = script_path
    self.setup_path = setup_path
    self.headnode = headnode
    self.ssl_thumbprint = ssl_thumbprint
    self.node_group = node_group
    self.hpc_client = HpcRestClient(client_cert)
    self.heartbeat_table = hpc_cluster_manager.HpcClusterManager(
        self.hpc_client, node_group=self.node_group)

    def kill_closed_nodes(hostnames):
        # An explicit loop is required: on Python 3 ``map`` is lazy, so
        # the previous ``lambda l: map(...)`` never killed anything.
        for hostname in hostnames:
            self._kill_task_by_hostname(hostname)

    self.heartbeat_table.subscribe_node_closed_callback(kill_closed_nodes)
    self.heartbeat_table.start()

    self.core_provisioning = 0.0
    # Set by the SUBSCRIBED callback once registration succeeds.
    self.driver = None  # type: MesosClient.SchedulerDriver

    framework_suffix = self.headnode.replace(',', '_')
    if self.node_group != "":
        framework_suffix = framework_suffix + '-' + self.node_group

    self.mesos_client = MesosClient(
        mesos_urls=['http://172.16.1.4:5050'],
        # mesos_urls=['zk://127.0.0.1:2181/mesos'],
        frameworkName="HPC-Pack-Framework-{}".format(framework_suffix))
    self.mesos_client.on(MesosClient.SUBSCRIBED, self.subscribed)
    self.mesos_client.on(MesosClient.OFFERS, self.offer_received)
    self.mesos_client.on(MesosClient.UPDATE, self.status_update)

    self.th = HpcpackFramwork.MesosFramework(self.mesos_client)
    self.stop = False
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # Set up global shutdown
        global GLOBAL_SHUTDOWN
        GLOBAL_SHUTDOWN = self._shutdown

        logger.info('Scale Scheduler %s', settings.VERSION)

        self.run_scheduler(settings.MESOS_MASTER)

    def run_scheduler(self, mesos_master):
        """Initialize the scheduler, connect to Mesos and run until exit.

        This method never returns normally: it always ends in ``sys.exit``.

        :param mesos_master: address of the Mesos master to connect to.
        """
        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection.
        # The address passed by the caller is used (previously the argument
        # was only logged and the setting was read again).
        self.client = MesosClient(
            mesos_urls=[mesos_master],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)

        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(
                json.loads(settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s', mesos_role)
        self.client.set_role(mesos_role)
        logger.info('Accepting offers from role: %s',
                    settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except BaseException:
            # Deliberately as broad as the original bare ``except``: Ctrl-C
            # and SystemExit must still reach the shutdown below.
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is
        received.
        """

        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown
            operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        # The signal handler is registered before the scheduler is created,
        # so the attribute may not exist yet; that is not a failed shutdown.
        scheduler = getattr(self, 'scheduler', None)
        try:
            if scheduler:
                scheduler.shutdown()
        except Exception:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        return status
class HpcFramework(object):
    """Mesos framework that launches queued jobs on compatible offers."""

    # NOTE: these lists are class attributes and therefore shared by every
    # instance (and reachable as ``HpcFramework.<name>``).
    messages = []
    to_be_scheduled = []
    scheduled = []

    # Task states that status_update recognises without reporting them.
    _KNOWN_STATES = frozenset([
        "TASK_STARTING",
        "TASK_RUNNING",
        "TASK_FINISHED",
        "TASK_FAILED",
        "TASK_KILLED",
        "TASK_ERROR",
        "TASK_DROPPED",
        "TASK_UNREACHABLE",
    ])

    def fprint(self, msg):
        """Timestamp ``msg``, record it in ``messages`` and print it."""
        msg = str(datetime.now()) + " HPC: " + msg
        self.messages.append(msg)
        print(msg)

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                # The thread has no ``fprint`` method (the old call raised
                # AttributeError), so the message is formatted here.
                msg = (str(datetime.now()) + " HPC: " +
                       'Stop requested by user, stopping framework....')
                HpcFramework.messages.append(msg)
                print(msg)

    def __init__(self):
        self.fprint("init")

    def start(self):
        """Register with the Mesos master and block until the thread stops.

        ``self.running`` is True while the registration thread is alive and
        is reset to False once it has ended.
        """
        self.fprint("Start, connecting to:" + MESOS_MASTER)
        # Set by the SUBSCRIBED callback once registration succeeds.
        self.driver = None
        self.client = MesosClient(mesos_urls=[MESOS_MASTER])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.client.frameworkName = "HPC Framework"

        self.th = HpcFramework.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; is_alive() is available
        # on both Python 2.6+ and Python 3.
        while self.th.is_alive():
            try:
                self.running = True
                self.th.join(1)
            except Exception as e:
                self.fprint("mesos framework exception" + str(e))
        self.running = False
        self.fprint("mesos framework stopped")

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop.

        Safe to call before ``start`` or before the SUBSCRIBED callback.
        """
        self.fprint("shutdown")
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.tearDown()
        client = getattr(self, 'client', None)
        if client is not None:
            client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the scheduler driver."""
        self.fprint("subscribed")
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: log the state and retire finished jobs.

        A job whose task reaches ``TASK_FINISHED`` is moved from
        ``scheduled`` to the module-level ``pending_jobs`` list. Updates
        for tasks that are not in ``scheduled`` are logged and otherwise
        ignored instead of raising ``StopIteration``.

        :param update: update dict with ``status.state`` and
            ``status.task_id.value``.
        """
        stat = update['status']
        state = stat['state']
        task_value = stat['task_id']['value']
        job = next(
            (x for x in self.scheduled
             if x['task_id']['value'] == task_value),
            None)
        self.fprint("status_update " + task_value[:4] + " " + str(state))

        if state == "TASK_FINISHED":
            if job is None:
                self.fprint("status_update for unknown task " +
                            task_value[:4])
                return
            self.scheduled.remove(job)
            pending_jobs.append(job)
        elif state not in self._KNOWN_STATES:
            self.fprint("unkown task state" + state)

    def compatible_offer(self, job, offer):
        """Return True when ``offer`` satisfies every resource of ``job``.

        Only ``SCALAR`` resources are compared; ``RANGES`` requirements are
        not checked yet, and any other resource type rejects the offer.

        :param job: task dict with a ``resources`` list.
        :param offer: offer object exposing ``get_offer()``.
        """
        of = offer.get_offer()
        for reqres in job['resources']:
            if reqres['type'] == "SCALAR":
                matching = (x for x in of['resources']
                            if x['name'] == reqres['name'])
                for avres in matching:
                    if reqres['scalar']['value'] > avres['scalar']['value']:
                        self.fprint(reqres['name'] + ": " +
                                    str(reqres['scalar']['value']) +
                                    "incompatible with offer " +
                                    str(avres['scalar']['value']))
                        return False
            elif reqres['type'] == "RANGES":
                self.fprint("TODO: Check ranges requirements")
            else:
                self.fprint("Unkown/invalid resource type: " +
                            str(reqres['type']))
                return False
        return True

    def offer_received(self, offers):
        """OFFERS callback: launch the head of the queue or decline.

        Each offer is tested against the first job in ``to_be_scheduled``;
        incompatible offers (or all offers when the queue is empty) are
        declined.
        """
        self.fprint("offer_received" + (str(offers)))
        for offer in offers:
            if self.to_be_scheduled and self.compatible_offer(
                    self.to_be_scheduled[0], offer):
                job = self.to_be_scheduled[0]
                self.fprint("Scheduling " + job['task_id']['value'][:4])
                self.to_be_scheduled.remove(job)
                self.scheduled.append(job)
                self.run_job(job, offer)
            else:
                offer.decline()

    def run_job(self, job, mesos_offer):
        """Bind ``job`` to the offering agent and accept the offer."""
        offer = mesos_offer.get_offer()
        self.fprint("run_job: " + job['task_id']['value'][:4] + " on " +
                    offer['hostname'])
        # setdefault: a job without an ``agent_id`` entry used to raise
        # KeyError here.
        job.setdefault('agent_id', {})['value'] = offer['agent_id']['value']
        mesos_offer.accept([job])
class Test(object):
    """Sample framework: runs one ``sleep 30`` task and kills it once running."""

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Register with the local master and block until stopped.

        The constructor does not return until the registration thread ends
        or the user presses Ctrl-C.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

        # Set by the SUBSCRIBED callback once registration succeeds.
        self.driver = None
        self.client = MesosClient(mesos_urls=['http://127.0.0.1:5050'])
        # self.client = MesosClient(mesos_urls=['zk://127.0.0.1:2181/mesos'])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)

        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; is_alive() is available
        # on both Python 2.6+ and Python 3.
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C can interrupt the wait.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        # Ctrl-C may arrive before SUBSCRIBED, while driver is still None.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the scheduler driver."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: kill a task as soon as it reports TASK_RUNNING."""
        if update['status']['state'] == 'TASK_RUNNING':
            self.driver.kill(update['status']['agent_id']['value'],
                             update['status']['task_id']['value'])

    def offer_received(self, offers):
        """OFFERS callback: use the first offer and decline the rest."""
        self.logger.warning('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a single ``sleep 30`` task."""
        offer = mesos_offer.get_offer()
        print(str(offer))
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 1000}
                }
            ],
            'command': {'value': 'sleep 30'},
            'container': {
                'type': 'MESOS',
                'mesos': {
                    'image': {
                        'type': 'DOCKER',
                        'docker': {'name': 'debian'}
                    }
                }
            }
        }
        mesos_offer.accept([task])
class Test(object):
    """Sample framework: runs one Docker task that posts to a Slack webhook."""

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Register with the hard-coded master and block until stopped.

        The constructor does not return until the registration thread ends
        or the user presses Ctrl-C.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

        # Set by the SUBSCRIBED callback once registration succeeds.
        self.driver = None
        self.client = MesosClient(mesos_urls=['http://52.87.159.219:5050'])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)

        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; is_alive() is available
        # on both Python 2.6+ and Python 3.
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C can interrupt the wait.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        # Ctrl-C may arrive before SUBSCRIBED, while driver is still None.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the scheduler driver."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: intentionally ignores every status update."""

    def offer_received(self, offers):
        """OFFERS callback: use the first offer and decline the rest."""
        self.logger.warning('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a single curl task in a Docker image."""
        offer = mesos_offer.get_offer()
        print(str(offer))
        # NOTE(review): the webhook URL below embeds a secret token in
        # source control; it should be rotated and supplied via
        # configuration. Left unchanged here to preserve behaviour.
        command = (
            "curl -X POST "
            "'https://hooks.slack.com/services/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' "
            "-H 'content-type: application/json; charset=UTF-8' "
            "-d '{\"text\" : \"Jan D.\"}'"
        )
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 64}
                }
            ],
            'command': {'value': command},
            'container': {
                'type': 'DOCKER',
                'docker': {'image': 'tutum/curl'}
            }
        }
        mesos_offer.accept([task])
class ESPAFramework(object):
    """Mesos framework that turns ESPA work items into Mesos tasks."""

    def __init__(self, cfg, espa_api, worklist):
        """Store scheduling settings and wire up the Mesos client.

        :param cfg: configuration object exposing ``get(key)``.
        :param espa_api: ESPA API handle, kept as ``self.espa``.
        :param worklist: queue of pending work, kept as ``self.workList``.
        """
        master = cfg.get('mesos_master')
        principal = cfg.get('mesos_principal')
        secret = cfg.get('mesos_secret')

        self.workList = worklist
        self.runningList = {}
        self.espa = espa_api
        self.cfg = cfg

        # Scheduling knobs copied from the config.
        self.max_cpus = cfg.get('max_cpu')
        self.required_cpus = cfg.get('task_cpu')
        self.required_memory = cfg.get('task_mem')
        self.required_disk = cfg.get('task_disk')
        self.task_image = cfg.get('task_image')
        self.refuse_seconds = cfg.get('offer_refuse_seconds')
        self.request_count = cfg.get('product_request_count')
        self.products = cfg.get('product_frequency')

        # Task states that status_update does not treat as errors.
        self.healthy_states = [
            "TASK_STAGING",
            "TASK_STARTING",
            "TASK_RUNNING",
            "TASK_FINISHED",
        ]

        self.client = MesosClient(mesos_urls=[master],
                                  frameworkName='ESPA Mesos Framework')
        self.client.verify = False
        self.client.set_credentials(principal, secret)
        for event, callback in (
                (MesosClient.SUBSCRIBED, self.subscribed),
                (MesosClient.OFFERS, self.offer_received),
                (MesosClient.UPDATE, self.status_update),
        ):
            self.client.on(event, callback)

        # put some work on the queue
        get_products_to_process(cfg, self.espa, self.workList)

    def _getResource(self, res, name):
        """Return the scalar value of the first resource called ``name``.

        Returns ``0.0`` when no resource in ``res`` has that name.
        """
        for entry in res:
            if entry['name'] == name:
                return entry['scalar']['value']
        return 0.0

    def _updateResource(self, res, name, value):
        """Subtract ``value`` from the first resource called ``name``.

        Non-positive values leave ``res`` untouched.
        """
        if value <= 0:
            return
        for entry in res:
            if entry['name'] != name:
                continue
            entry['scalar']['value'] -= value
            return

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the scheduler driver."""
        log.warning('SUBSCRIBED')
        self.driver = driver

    def core_limit_reached(self):
        """Return True when running tasks already use ``max_cpus`` cores."""
        cores_in_use = len(self.runningList) * self.required_cpus
        log.debug("Number of cores being used: {}".format(cores_in_use))
        if cores_in_use < self.max_cpus:
            return False
        log.debug("Max number of cores being used. Max = {}".format(
            self.max_cpus))
        return True

    def accept_offer(self, offer):
        """Check ``offer`` against the per-task cpu/mem/disk requirements.

        When every non-zero requirement is covered, the required amounts
        are deducted from the offer's resources in place.

        :param offer: offer dict with a ``resources`` list.
        :returns: True when the offer is acceptable.
        """
        resources = offer.get('resources')
        requirements = (
            ("cpus", self.required_cpus),
            ("mem", self.required_memory),
            ("disk", self.required_disk),
        )

        acceptable = True
        for res_name, needed in requirements:
            if needed != 0 and needed > self._getResource(resources, res_name):
                acceptable = False

        if acceptable:
            for res_name, needed in requirements:
                self._updateResource(resources, res_name, needed)
        return acceptable

    def decline_offer(self, offer):
        """Decline ``offer`` using the configured refuse interval.

        Errors raised by the decline call are logged and re-raised.

        :returns: True when the offer was declined.
        """
        options = {'filters': {'refuse_seconds': self.refuse_seconds}}
        log.debug("declining offer: {} with options: {}".format(
            offer, options))
        try:
            offer.decline(options)
        except Exception as error:
            log.error(
                "Exception encountered declining offer: {}, error: {}".format(
                    offer, error))
            raise
        return True

    def _decline_all(self, offers):
        """Decline every offer in ``offers``, in order."""
        for offer in offers:
            self.decline_offer(offer)

    def _launch_from_queue(self, offer, mesos_offer, response):
        """Take one work item and launch it on ``offer``, else decline."""
        try:
            # will raise multiprocessing.Empty if no objects present
            work = self.workList.get(False)
            orderid = work.get('orderid')
            scene = work.get('scene')
            task_id = "{}_@@@_{}".format(orderid, scene)
            new_task = task.build(task_id, mesos_offer, self.task_image,
                                  self.required_cpus, self.required_memory,
                                  self.required_disk, work, self.cfg)
            log.debug("New Task definition: {}".format(new_task))
            offer.accept([new_task])
            self.espa.update_status(scene, orderid, 'tasked')
            response.offers.accepted += 1
        except Empty:
            log.debug("Work queue is empty, declining offer")
            self.decline_offer(offer)
        except Exception as e:
            log.error(
                "Exception creating new task. offer: {}, exception: {}\n declining offer"
                .format(offer, e))
            self.decline_offer(offer)

    def offer_received(self, offers):
        """OFFERS callback: launch queued work on acceptable offers.

        All offers are declined when Mesos tasks are disabled or the core
        limit has been reached.

        :returns: an ``addict.Dict`` summarising what was done.
        """
        response = addict.Dict()
        response.offers.length = len(offers)
        response.offers.accepted = 0
        log.debug("Received {} new offers...".format(response.offers.length))

        # check to see if Mesos tasks are enabled
        if self.espa.mesos_tasks_disabled():
            # decline the offers to free up the resources
            log.debug("mesos tasks disabled, declining {} offers".format(
                len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response
        response.tasks.enabled = True

        # check to see if core limit has been reached
        if self.core_limit_reached():
            # decline the offers to free up the resources
            log.debug(
                "Core utilization limit reached, declining {} offers".format(
                    len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response
        response.tasks.enabled = True

        for offer in offers:
            mesos_offer = offer.get_offer()
            if not self.accept_offer(mesos_offer):
                log.debug("Unacceptable offer, declining")
                self.decline_offer(offer)
                continue
            log.debug("Acceptable offer, checking for work to do")
            self._launch_from_queue(offer, mesos_offer, response)

        log.debug("resourceOffer response: {}".format(response))
        return response

    def status_update(self, update):
        """UPDATE callback: track running tasks and report failures.

        Possible state values:
        http://mesos.apache.org/api/latest/java/org/apache/mesos/Protos.TaskState.html

        :returns: an ``addict.Dict`` describing the update.
        """
        status = update['status']
        task_id = status['task_id']['value']
        orderid, scene = task_id.split("_@@@_")
        state = status['state']

        response = addict.Dict()
        response.task_id = task_id
        response.state = state

        if state not in self.healthy_states:
            # something abnormal happened
            log.error("abnormal task state for: {}, full update: {}".format(
                task_id, update))
            response.status = "unhealthy"
            self.espa.set_scene_error(scene, orderid, update)
            self.runningList.pop(task_id, None)
            return response

        log.debug("status update for: {} new status: {}".format(
            task_id, state))
        response.status = "healthy"

        if state == "TASK_RUNNING":
            response.list.name = "running"
            if task_id in self.runningList:
                response.list.status = "current"
            else:
                self.runningList[task_id] = util.right_now()
                response.list.status = "new"

        if state == "TASK_FINISHED":
            try:
                del self.runningList[task_id]
            except KeyError:
                log.debug(
                    "Received TASK_FINISHED update for {}, which wasn't in the runningList"
                    .format(task_id))

        return response
class HpcpackFramwork(object):
    """Mesos framework that grows an HPC Pack cluster from resource offers."""

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self, script_path, setup_path, headnode, ssl_thumbprint, client_cert, node_group=""):
        """Set up HPC Pack cluster tracking and the Mesos client.

        :param script_path: path of the PowerShell setup script run by tasks.
        :param setup_path: value passed to the script as ``-setupPath``.
        :param headnode: head node name(s); commas are replaced by
            underscores to build the framework name.
        :param ssl_thumbprint: value passed to the script as
            ``-sslthumbprint``.
        :param client_cert: certificate handed to ``HpcRestClient``.
        :param node_group: optional node group; an empty string means no
            group restriction.
        """
        logging.basicConfig()
        self.logger = logging_aux.init_logger_aux("hpcframework", "hpcframework.log")
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

        self.node_idle_check_table = {}
        self.script_path = script_path
        self.setup_path = setup_path
        self.headnode = headnode
        self.ssl_thumbprint = ssl_thumbprint
        self.node_group = node_group
        self.hpc_client = HpcRestClient(client_cert)
        self.heartbeat_table = hpc_cluster_manager.HpcClusterManager(
            self.hpc_client, node_group=self.node_group)
        self.heartbeat_table.subscribe_node_closed_callback(
            self._kill_tasks_by_hostnames)
        self.heartbeat_table.start()

        self.core_provisioning = 0.0
        # Set by the SUBSCRIBED callback once registration succeeds.
        self.driver = None  # type: MesosClient.SchedulerDriver

        framework_suffix = self.headnode.replace(',', '_')
        if self.node_group != "":
            framework_suffix = framework_suffix + '-' + self.node_group

        self.mesos_client = MesosClient(
            mesos_urls=['http://172.16.1.4:5050'],
            # mesos_urls=['zk://127.0.0.1:2181/mesos'],
            frameworkName="HPC-Pack-Framework-{}".format(framework_suffix))
        self.mesos_client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.mesos_client.on(MesosClient.OFFERS, self.offer_received)
        self.mesos_client.on(MesosClient.UPDATE, self.status_update)

        self.th = HpcpackFramwork.MesosFramework(self.mesos_client)
        self.stop = False

    def start(self):
        """Start the registration thread and block until it stops."""
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; is_alive() is available
        # on both Python 2.6+ and Python 3.
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C can interrupt the wait.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def __encode_utf16b64(self, content):
        """Return ``content`` as base64 of its UTF-16 bytes without a BOM."""
        utf16 = content.encode('utf-16')
        utf16_nobom = utf16[2:] if utf16[0:2] == codecs.BOM_UTF16 else utf16
        utf16_b64 = base64.b64encode(utf16_nobom)
        return utf16_b64

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        # Single-argument call form: valid as a Python 2 print statement and
        # as a Python 3 function call.
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        # Ctrl-C may arrive before SUBSCRIBED, while driver is still None.
        if self.driver is not None:
            self.driver.tearDown()
        self.mesos_client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the scheduler driver."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: log the update."""
        self.logger.info("Update received:\n{}".format(str(update)))

    def offer_received(self, offers):
        """OFFERS callback: accept offers while the cluster needs cores.

        An offer is taken only while cores remain to grow and it comes
        from a ``windows_server`` agent in the matching node group that
        offers (nearly) all of its cores and whose hostname does not
        collide with a known node. Every other offer is declined, and any
        offer left unhandled after an error is declined before returning.

        ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller;
        other exceptions are logged.
        """
        handled_offer = []  # type: List[Offer]
        try:
            if self.node_group == "":
                grow_decision = self.hpc_client.get_grow_decision()
            else:
                grow_decision = self.hpc_client.get_grow_decision(
                    self.node_group)

            if grow_decision is None:
                cores_to_grow = 0
                cores_in_provisioning = 0
            else:
                cores_in_provisioning = (
                    self.heartbeat_table.get_cores_in_provisioning())
                cores_to_grow = (
                    grow_decision.cores_to_grow - cores_in_provisioning)

            for offer in offers:  # type: Offer
                take_offer = False
                cpus = 0.0
                if cores_to_grow > 0:
                    offer_dict = offer.get_offer()
                    self.logger.info(
                        "cores_to_grow: {}, cores_in_provisioning: {}, offer_received: {}"
                        .format(cores_to_grow, cores_in_provisioning,
                                (str(offer_dict))))
                    if 'attributes' in offer_dict:
                        attributes = offer_dict['attributes']
                        if get_text(attributes, 'os') == 'windows_server':
                            match_node_group = (
                                self.node_group == "" or
                                get_text(attributes, 'node_group').upper()
                                == self.node_group.upper())
                            if match_node_group:
                                cores = get_scalar(attributes, 'cores')
                                cpus = get_scalar(offer_dict['resources'],
                                                  'cpus')
                                # work around of MESOS-8631
                                if cpus >= cores - 0.1:
                                    if not self.heartbeat_table.check_fqdn_collision(
                                            offer_dict['hostname']):
                                        take_offer = True
                if take_offer:
                    cores_to_grow -= cpus
                    self.accept_offer(offer)
                else:
                    self.decline_offer(offer)
                handled_offer.append(offer)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as ex:
            self.logger.exception(ex)
        finally:
            # We have to either accept or decline an offer.
            # No ``return`` here: returning from ``finally`` would swallow
            # the KeyboardInterrupt/SystemExit re-raised above.
            self._decline_unhandled(offers, handled_offer)

    def _decline_unhandled(self, offers, handled_offer):
        """Decline offers not yet handled, retrying every 10 s on errors."""
        # TODO: Retry in a separate thread
        while True:
            try:
                if not offers:
                    return
                to_decline = [
                    offer for offer in offers if offer not in handled_offer
                ]
                for offer in to_decline:
                    self.decline_offer(offer)
                    handled_offer.append(offer)
                return
            except Exception as ex:
                self.logger.exception(ex)
                time.sleep(10)

    def decline_offer(self, offer):
        """Log and decline ``offer``."""
        self.logger.info("Decline offer %s" % offer.get_offer()['id']['value'])
        offer.decline()

    def accept_offer(self, offer):
        """Accept ``offer`` with a task that runs the HPC Pack setup script.

        The task asks for the agent's ``cores`` attribute minus 0.1 cpus
        (work around of MESOS-8631) and all offered memory, then records
        the node in the heartbeat table.
        """
        self.logger.info("Offer %s meets HPC's requirement" %
                         offer.get_offer()['id']['value'])
        offer_dict = offer.get_offer()
        self.logger.info("Accepting offer: {}".format(str(offer_dict)))
        agent_id = offer_dict['agent_id']['value']
        fqdn = offer_dict['hostname']
        task_id = uuid.uuid4().hex
        cpus = get_scalar(offer_dict['resources'], 'cpus')

        # work around of MESOS-8631
        if 'attributes' in offer_dict:
            attributes = offer_dict['attributes']
            cores = get_scalar(attributes, 'cores')
        else:
            cores = cpus

        task = {
            'name': 'hpc pack mesos cn',
            'task_id': {'value': task_id},
            'agent_id': {'value': agent_id},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    # work around of MESOS-8631
                    'scalar': {'value': cores - 0.1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {
                        'value': get_scalar(offer_dict['resources'], 'mem')
                    }
                }
            ],
            'command': {
                'value':
                'powershell -File ' + self.script_path + " -setupPath " +
                self.setup_path + " -headnode " + self.headnode +
                " -sslthumbprint " + self.ssl_thumbprint +
                " > setupscript.log"
            }
        }
        self.logger.debug("Sending command:\n{}".format(
            task['command']['value']))
        offer.accept([task])
        self.heartbeat_table.add_slaveinfo(fqdn, agent_id, task_id, cpus)

    def _kill_task(self, host):
        """Kill the task recorded on ``host``."""
        self.logger.debug("Killing task {} on host {}".format(
            host.task_id, host.fqdn))
        self.driver.kill(host.agent_id, host.task_id)

    def _kill_tasks_by_hostnames(self, hostnames):
        """Node-closed callback: kill the task on every listed hostname.

        An explicit loop is required: on Python 3 ``map`` is lazy, so the
        previous ``lambda l: map(...)`` callback never killed anything.
        """
        for hostname in hostnames:
            self._kill_task_by_hostname(hostname)

    def _kill_task_by_hostname(self, hostname):
        """Kill the task recorded for ``hostname``, if any."""
        (task_id, agent_id) = self.heartbeat_table.get_task_info(hostname)
        if task_id != "":
            self.logger.debug("Killing task {} on host {}".format(
                task_id, hostname))
            self.driver.kill(agent_id, task_id)
        else:
            self.logger.warning(
                "Task info for host {} not found".format(hostname))
class Test(object):
    """Step-by-step test harness around a ``MesosClient`` framework.

    The harness keeps at most one offer and one status update at a time so
    that a caller can drive the framework manually: ``start()`` it, wait for
    an offer, launch a task on it, wait for an update, kill/reconcile, etc.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            # Set to True once run() has returned from register().
            self.exited = False

        def run(self):
            """Register the client and flag the thread as done afterwards."""
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')
            print("MesosFramework: run end")
            self.exited = True

        def is_done(self):
            """Return True once ``run`` has finished."""
            return self.exited

    def __init__(self, url="http://127.0.0.1:5050", name="HTTP framework",
                 user="******"):
        """Create the client and wire the event callbacks.

        The framework thread is created here but only started by ``start()``.

        :param url: Mesos master URL handed to ``MesosClient``.
        :param name: framework name handed to ``MesosClient``.
        :param user: framework user handed to ``MesosClient``.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        self.tasks = 0
        #signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        self.mesos_offer = None  # to store only one offer
        self.update = None
        self.task_id = None
        self.agent_id = None
        self.client = MesosClient(mesos_urls=[url],
                                  frameworkName=name,
                                  frameworkUser=user)
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)

    def reset_offer(self):
        """Forget the stored offer and return a deep copy of it."""
        ret = copy.deepcopy(self.mesos_offer)
        self.mesos_offer = None
        return ret

    def reset_update(self):
        """Forget the stored status update and return a deep copy of it."""
        ret = copy.deepcopy(self.update)
        self.update = None
        return ret

    def start(self):
        """Start the background framework thread."""
        self.th.start()

    def is_done(self):
        """Return True once the framework thread has finished."""
        return self.th.is_done()

    def wait_offer(self, sec):
        """Poll once per second, up to ``sec`` times, for an offer.

        :return: the stored offer, or None on timeout.
        """
        cnt = 0
        while self.mesos_offer is None and cnt < sec:
            print("Waiting for offer: %d" % cnt)
            cnt += 1
            gevent.sleep(1)
        if self.mesos_offer is None:
            print("Timeout waiting for offer")
        return self.mesos_offer
        # return self.wait4(self.mesos_offer, sec, "offer")

    def wait_update(self, sec):
        """Poll once per second, up to ``sec`` times, for a status update.

        :return: the stored update, or None on timeout.
        """
        cnt = 0
        while self.update is None and cnt < sec:
            print("Waiting for update %d" % cnt)
            cnt += 1
            gevent.sleep(1)
        if self.update is None:
            print("Timeout waiting for update")
        return self.update
        # return self.wait4(self.update, sec, "update")

    def wait4(self, obj, sec, msg):
        """Sleep in one-second steps while ``obj`` is None, up to ``sec`` times.

        NOTE: ``obj`` is a plain argument, so it cannot change while waiting;
        a None argument always runs into the timeout.
        """
        cnt = 0
        while obj is None and cnt < sec:
            print("Waiting for %s %s %d" % (msg, repr(obj), cnt))
            cnt += 1
            gevent.sleep(1)
        if obj is None:
            print("Timeout waiting for: %s" % msg)
        return obj

    def shutdown(self):
        """Tear down the driver and ask the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.driver.tearDown()
        self.client.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver for later calls."""
        print('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: keep the first update until it is reset."""
        print("STATUS UPDATE: %s" % update['status']['state'])
        if self.update is None:
            self.update = update
            self.task_id = update['status']['task_id']
            self.agent_id = update['status']['agent_id']
            print("Use update: %s" % self.update)
        # if update['status']['state'] == 'TASK_RUNNING':
        #     self.tasks+=1
        #     self.logger.warn("Task %s (%d/%d): TASK_RUNNING" % (update['status']['agent_id']['value'],self.tasks,max_tasks))
        #     self.driver.kill(update['status']['agent_id']['value'], update['status']['task_id']['value'])
        #     if self.tasks >= max_tasks:
        #         self.logger.warn("Max tasks %d lauched, shutdown now" % self.tasks)
        #         self.shutdown()
        # elif update['status']['state'] == 'TASK_FINISHED':
        #     self.logger.warn("Task %s: TASK_FINISHED" % update['status']['agent_id']['value'])
        #     self.tasks+=1
        #     if self.tasks >= self.max_tasks:
        #         self.shutdown()
        # else:
        #     self.logger.warn("Status update: %s" % update['status']['state'])

    def offer_received(self, offers):
        """OFFERS callback: keep the first offer if none is stored, decline the rest."""
        print("OFFERS: %s" % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0 and self.mesos_offer is None:
                self.mesos_offer = offer
                print("Use offer: %s" % self.mesos_offer.get_offer())
            else:
                print("Declining offer: %s" % offer.get_offer())
                offer.decline()

    def run_task(self, sec, o=None):
        """Launch a ``sleep <sec>`` task by accepting the stored offer.

        :param sec: number of seconds the task sleeps.
        :param o: optional offer dict used for the agent id; defaults to the
            dict of the stored offer. The stored offer is the one accepted
            in either case.
        """
        offer = o if o else self.mesos_offer.get_offer()
        print("Run task on offer: %s" % str(offer))
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 1000}
                }
            ],
            'command': {'value': 'sleep ' + str(sec)},
        }
        self.mesos_offer.accept([task])

    def kill_task(self, task_name=None, agent_name=None):
        """Kill a task; ids default to those of the stored status update."""
        t = task_name if task_name else self.update['status']['task_id']['value']
        a = agent_name if agent_name else self.update['status']['agent_id']['value']
        print("kill task: %s on %s" % (t, a))
        self.driver.kill(a, t)

    def send_message(self, msg, agent_id=None, executor_id=None):
        """Send ``msg`` through the driver to an executor.

        :param msg: payload passed to ``driver.message``.
        :param agent_id: target agent; defaults to ``self.update['agent_id']``.
        :param executor_id: target executor; defaults to
            ``self.update['executor_id']``.
        """
        # NOTE(review): kill_task reads ids from self.update['status'][...],
        # so these top-level fallback keys may not exist on a stored update —
        # confirm against the update schema.
        a = agent_id if agent_id else self.update['agent_id']
        # Previously tested an undefined name (task_name), which raised
        # NameError on every call.
        e = executor_id if executor_id else self.update['executor_id']
        print("Send message: %s" % msg)
        self.driver.message(a, e, msg)

    def reconcile(self):
        """Ask the driver to reconcile the task of the last stored update."""
        print("Reconcile")
        task = {
            'task_id': self.task_id,
            'agent_id': self.agent_id
        }
        self.driver.reconcile([task])
class Test(object):
    """Sample framework: run one Docker-image task per offer batch.

    The first offer of every batch is used to launch a ``sleep 30`` task in a
    ``debian`` image; as soon as a task reports TASK_RUNNING it is killed.
    Constructing the object blocks until the framework thread ends or the
    user presses Ctrl-C.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Build the client from the environment, then run until stopped.

        Environment variables read here:
        ``MESOS_URLS`` (comma-delimited master URLs) and ``SERVICE_SECRET``
        (JSON document passed to ``set_service_account``).
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        # Note: leader.mesos address requires Mesos DNS
        #self.client = MesosClient(mesos_urls=['zk://leader.mesos:2181/mesos'])
        # If you are purely using Mesos, you should use explicit address of Master
        # Example: Zookeeper master discovery
        #self.client = MesosClient(mesos_urls=['zk://127.0.0.1:2181/mesos'])
        # Example: Directly address Mesos
        #self.client = MesosClient(mesos_urls=['http://127.0.0.1:5050'])

        # By default, use direct master addressing
        # Allow for comma delimited URLs to be passed in via MESOS_URLS
        # environment variable
        master_urls = os.getenv('MESOS_URLS', 'http://127.0.0.1:5050')
        self.client = MesosClient(mesos_urls=master_urls.split(','))

        secret = os.getenv('SERVICE_SECRET')
        if secret:
            self.client.set_service_account(json.loads(secret))

        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Join in short slices so the main thread can still see Ctrl-C.
        # Thread.isAlive() no longer exists on Python 3.9+; is_alive() works
        # on both Python 2 and 3.
        while self.th.is_alive():
            try:
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Ask the client to stop and tear down the driver if there is one."""
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        self.client.stop = True
        # The driver only exists after the SUBSCRIBED callback has fired;
        # a Ctrl-C before that must not crash the shutdown path.
        if self.driver is not None:
            self.driver.tearDown()
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver for later calls."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: kill every task as soon as it is running."""
        status = update['status']
        if status['state'] == 'TASK_RUNNING':
            self.driver.kill(status['agent_id']['value'],
                             status['task_id']['value'])

    def offer_received(self, offers):
        """OFFERS callback: launch a job on the first offer, decline the rest."""
        self.logger.warning('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a single ``sleep 30`` container task."""
        offer = mesos_offer.get_offer()
        print(str(offer))
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 1000}
                }
            ],
            'command': {'value': 'sleep 30'},
            'container': {
                'type': 'MESOS',
                'mesos': {
                    'image': {
                        'type': 'DOCKER',
                        'docker': {'name': 'debian'}
                    }
                }
            }
        }
        mesos_offer.accept([task])
class Test(object):
    """Sample framework exercising kill, failure and lost-task relaunch.

    One short task is launched per offer batch. Every third TASK_RUNNING
    update triggers a kill, every fourth launch (counting from zero) uses a
    failing command, and lost tasks are queued to be relaunched under the
    same task id. Constructing the object blocks until the framework thread
    ends or the user presses Ctrl-C.

    ``max_tasks`` is a module-level name defined outside this class.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self, url="http://127.0.0.1:5050"):
        """Create the client for ``url``, start it and wait until stopped.

        :param url: Mesos master URL handed to ``MesosClient``.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        self.launched_tasks = 0
        self.running_tasks = 0
        self.finished_tasks = 0
        self.killed_tasks = 0
        # Task ids reported TASK_LOST, waiting to be relaunched.
        self.lost_tasks = []
        # self.failed_tasks = []
        #signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        self.client = MesosClient(mesos_urls=[url],
                                  frameworkId=None,
                                  frameworkName='Python HTTP framework',
                                  frameworkUser=getpass.getuser())
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Join in short slices so the main thread can still see Ctrl-C.
        # Thread.isAlive() no longer exists on Python 3.9+; is_alive() works
        # on both Python 2 and 3.
        while self.th.is_alive():
            try:
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver if there is one and ask the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.logger.info('Stop requested by user, stopping framework....')
        # The driver only exists after the SUBSCRIBED callback has fired;
        # a Ctrl-C before that must not crash the shutdown path.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver for later calls."""
        self.logger.info('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: count task states and react to them.

        Kills every third running task, shuts down once ``max_tasks`` tasks
        have finished, and triggers a reconcile on lost or failed tasks.
        """
        status = update['status']['state']
        task = update['status']['task_id']['value']
        self.logger.info("Task %s: %s" % (task, status))
        if status == 'TASK_RUNNING':
            self.running_tasks += 1
            if not self.running_tasks % 3:
                # if not self.running_tasks%3 and task not in self.failed_tasks:
                self.logger.info("Killing task: %s" % task)
                self.driver.kill(update['status']['agent_id']['value'], task)
        elif status == 'TASK_FINISHED':
            self.finished_tasks += 1
            if self.finished_tasks >= max_tasks:
                self.logger.info("All %d tasks finished, shutdown now" %
                                 self.finished_tasks)
                self.shutdown()
        elif status == 'TASK_LOST':
            self.lost_tasks.append(task)
            self.logger.info("Reconcile on lost task")
            self.driver.reconcile([])
        elif status == 'TASK_FAILED':
            # self.failed_tasks.append(task)
            self.logger.info("Reconcile on failed task")
            self.driver.reconcile([])
        else:
            self.logger.info("Status update: %s" % update['status']['state'])
        self.logger.info("(l%d/f%d/r%d/m%d)" %
                         (self.launched_tasks, self.finished_tasks,
                          self.running_tasks, max_tasks))

    def _next_task_name(self):
        """Return the oldest lost task id, or a fresh id if none is pending."""
        if self.lost_tasks:
            task_name = self.lost_tasks.pop(0)
            print("Relaunching lost task: %s" % task_name)
            return task_name
        return uuid.uuid4().hex

    def offer_received(self, offers):
        """OFFERS callback: launch a job on the first offer, decline the rest.

        The task id is only chosen once an offer is actually available, so an
        empty batch no longer consumes (and loses) a pending lost task.
        """
        self.logger.info('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer, self._next_task_name())
            else:
                offer.decline()

    # def run_job(self, mesos_offer, task_name = uuid.uuid4().hex):
    def run_job(self, mesos_offer, task_name):
        """Accept ``mesos_offer`` with one small task named ``task_name``.

        Launch number 0, 4, 8, ... uses a command that exits with status 1;
        all other launches use a plain ``sleep 3``.
        """
        offer = mesos_offer.get_offer()
        cmd = "sleep 3" if self.launched_tasks % 4 else 'sleep 2; exit 1'
        # print(str(offer))
        task = {
            'name': 'sample http task',
            'task_id': {'value': task_name},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 0.1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 100}
                }
            ],
            'command': {'value': cmd},
        }
        mesos_offer.accept([task])
        self.launched_tasks += 1