def rolling_replace_app(service_name, app1_id, app2_id, app2_config, labels):
    print(' replacing ' + app1_id + ' with ' + app2_id)
    marathon_client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    app1 = marathon_client.get_app(app1_id)
    old_tasks = app1.tasks
    # launcher.launch(group2.service.name, group2.encode_marathon_id, group2.config, instances = 0)
    launcher.launch_app(service_name, app2_id, app2_config, labels, instances=0)
    new_app = marathon_client.get_app(app2_id)
    for old_task in old_tasks:
        #
        # replace each old task with a new task of the new app
        #
        num_started = num_started_tasks(app2_id)
        new_instances = num_started + 1  # add 1 instance of new task
        launcher.update_app(app2_id, app2_config, new_instances)
        while num_started < new_instances:
            time.sleep(1)
            print('waiting for app to start ' + str(num_started))
            num_started = num_started_tasks(app2_id)
        #
        # take down old task
        #
        marathon_client.kill_task(app1_id, old_task.id, scale=True)
    marathon_client.delete_app(app1_id)
def launch_elsa(marathon, stats_file, scale_window):
    logging.info('Start monitoring the inbound traffic on topics using %s' % (stats_file))
    # make sure the stats file is properly initialized:
    if not os.path.exists(stats_file):
        f = open(stats_file, 'w')
        f.write('0')
        f.close()
    # launch the Elsa app via Marathon
    c = MarathonClient(marathon)
    c.create_app('elsa', MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh',
                                     mem=200, cpus=1, user='******'))
    # c.list_apps()
    print('ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...')
    time.sleep(5)  # allow time to deploy before autoscaling sets in

    # kick off traffic monitoring and trigger autoscaling:
    previous_topic_traffic = 0
    try:
        while True:
            with open(stats_file, 'r') as elsa_file:
                topic_traffic = int(elsa_file.read())
            topic_traffic_diff = topic_traffic - previous_topic_traffic
            print('Difference in traffic in the past %d seconds: %d' % (scale_window, topic_traffic_diff))
            previous_topic_traffic = topic_traffic
            current_instance_num = c.get_app('elsa').instances
            if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD:  # we see a surge of traffic above threshold ...
                instance_multiplier = int(topic_traffic_diff / SCALE_FACTOR)  # ... increase number of instances
                c.scale_app('elsa', current_instance_num * instance_multiplier)
                print('Increasing number of instances to %d' % (current_instance_num * instance_multiplier))
            elif topic_traffic_diff < 0:  # negative, back off exponentially
                target_instance_num = int(current_instance_num / 2)
                if target_instance_num > 1:
                    c.scale_app('elsa', target_instance_num)
                    print('Decreasing number of instances to %d' % (target_instance_num))
                else:
                    c.scale_app('elsa', 1)
                    print('Resetting number of instances to 1')
            time.sleep(scale_window)
    except KeyboardInterrupt:
        print('ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!')
        c.delete_app('elsa', force=True)
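# A minimal sketch of how launch_elsa() might be wired up as a script. The
# constants, defaults and argument names below are illustrative assumptions,
# not taken from the original ElSA project.
import argparse

TRAFFIC_INCREASE_THRESHOLD = 1000   # assumed threshold (messages per scale window)
SCALE_FACTOR = 500                  # assumed divisor for the instance multiplier

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='ElSA auto-scaling launcher (sketch)')
    arg_parser.add_argument('--marathon', default='http://localhost:8080')
    arg_parser.add_argument('--stats-file', default='/tmp/elsa-stats.txt')
    arg_parser.add_argument('--scale-window', type=int, default=10,
                            help='seconds between scaling decisions')
    cli_args = arg_parser.parse_args()
    launch_elsa(cli_args.marathon, cli_args.stats_file, cli_args.scale_window)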
class MarathonDeployer(object):

    def __init__(self, marathon_url):
        self.url = marathon_url
        self.client = MarathonClient(self.url)

    def deploy(self, task_chain, environment_name):
        deployed_chain = DeployedTaskChain(task_chain, environment_name)
        for task in deployed_chain.list_all_tasks():
            task_id = task['id']
            safe_name = task_id.lower()
            # safe_name = task['name'].replace('.', '').lower()
            try:
                if self.client.get_app(safe_name):
                    self.client.delete_app(safe_name)
                    time.sleep(2)
            except Exception:
                pass
            app = MarathonApp(cmd='/var/riversnake/invoke.py {0} {1} {2}'.format(
                                  task_chain.flow_name, environment_name, task_id),
                              mem=16, cpus=1)
            self.client.create_app(safe_name, app)
def send_to_marathon(request):
    try:
        if request.method == 'POST':
            action = request.POST.get('action', None)
            id = request.POST.get('id', None)
            mc = MarathonClient('http://{}:{}'.format(settings.MARATHON['host'], settings.MARATHON['port']))
            if action == 'stop':
                mc.scale_app(id, 0, force=True)
            elif action == 'start':
                mc.scale_app(id, 1)
            elif action == 'destroy':
                if request.user.has_perm("auth.can_init_app"):
                    mc.delete_app(id)
                else:
                    raise PermissionDenied
            elif action == 'restart':
                mc.restart_app(id)
            elif action == 'scale':
                mc.scale_app(id, int(request.POST.get('number_instance')))
            elif action == 'update':
                app = mc.get_app(id)
                app.cpus = float(request.POST.get('cpus'))
                app.mem = float(request.POST.get('mem'))
                app.container.docker.image = request.POST.get('version')
                mc.update_app(id, app)
            elif action == "stop-deployment":
                mc.delete_deployment(id)
        result = '{"status":"success", "msg": "%(action)s success"}' % {"action": action}
    except Exception as e:
        result = '{"status":"error", "msg": "%(action)s fail: %(error)s" }' % {"action": action, "error": html.escape(str(e))}
    return HttpResponse(result)
def servermarathon():
    APP = os.environ['APPNAME']
    # STRING = os.environ['STRING']
    # content = STRING
    # contentlist = content.split('&')
    # list = []
    # for i in contentlist:
    #     p = i.split('=')
    #     p = p[1]
    #     l = list.append(p)
    # (maurl, mau, map) = tuple(list)
    # marathonip = maurl
    # user = mau
    # password = map
    c = MarathonClient(marathonip, username=user, password=password)
    buildFile = open('build.txt', 'r')
    dockerimage = buildFile.readline()
    buildFile.close()
    readed = json.load(open('temp.json', 'r'))
    readed['container']['docker']['image'] = dockerimage
    readed['id'] = APP
    json.dump(readed, open('app.json', 'w'))
    try:
        c.delete_app(APP, force=True)
        print('delete')
    except:
        pass
    sleep(3)
    u = user + ':' + password
    cmd1 = os.system('curl -u %s -X POST -H "Content-Type: application/json" %s/v2/apps -d@app.json' % (u, marathonip))
class MarathonCluster(object):
    def __init__(self, scheduler,
                 executable='dask-worker',
                 docker_image='mrocklin/dask-distributed:1.15.2',
                 marathon_address='http://localhost:8080',
                 name=None, **kwargs):
        self.scheduler = scheduler
        self.executor = ThreadPoolExecutor(1)

        # Create Marathon App to run dask-worker
        args = [executable, scheduler.address,
                '--name', '$MESOS_TASK_ID',  # use Mesos task ID as worker name
                '--worker-port', '$PORT_WORKER',
                '--bokeh-port', '$PORT_BOKEH',
                '--nanny-port', '$PORT_NANNY',
                '--http-port', '$PORT_HTTP']

        ports = [{'port': 0, 'protocol': 'tcp', 'name': name}
                 for name in ['worker', 'nanny', 'http', 'bokeh']]

        if 'mem' in kwargs:
            args.extend(['--memory-limit', str(int(kwargs['mem'] * 0.6 * 1e6))])

        kwargs['cmd'] = ' '.join(args)

        container = MarathonContainer({'image': docker_image})
        app = MarathonApp(instances=0, container=container,
                          port_definitions=ports, **kwargs)

        # Connect and register app
        self.client = MarathonClient(marathon_address)
        self.app = self.client.create_app(name or 'dask-%s' % uuid.uuid4(), app)

    def scale_up(self, instances):
        self.executor.submit(self.client.scale_app, self.app.id,
                             instances=instances)

    def scale_down(self, workers):
        for w in workers:
            self.executor.submit(self.client.kill_task, self.app.id,
                                 self.scheduler.worker_info[w]['name'],
                                 scale=True)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        self.client.delete_app(self.app.id, force=True)
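# A hedged usage sketch for the MarathonCluster above. get_scheduler() is a
# hypothetical helper standing in for however you obtain a running dask
# distributed Scheduler (anything exposing `.address` and `.worker_info`
# works); the Marathon endpoint and resource figures are illustrative
# assumptions.
scheduler = get_scheduler()

with MarathonCluster(scheduler,
                     docker_image='mrocklin/dask-distributed:1.15.2',
                     marathon_address='http://localhost:8080',
                     cpus=1, mem=512) as cluster:
    cluster.scale_up(4)   # ask Marathon for four dask-worker instances
    # ... submit work to the scheduler here ...
# leaving the block calls close(), which force-deletes the Marathon app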
def undeploy(app_name):
    """Calls marathon API to undeploy application
    :param app_name:
    :return:
    """
    marathon_addresses = _addresses()
    cli = MarathonClient(marathon_addresses)
    if _is_deployed(cli, app_name):
        return cli.delete_app(app_name)
    else:
        return None
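# Hedged sketches of the helpers referenced above; the real _addresses() and
# _is_deployed() are not shown in the source, so both bodies here are
# assumptions.
import os
from marathon import MarathonClient
from marathon.exceptions import NotFoundError


def _addresses():
    # Assumption: endpoints come from an environment variable such as
    # MARATHON_ADDRESSES, e.g. "http://m1:8080,http://m2:8080".
    return os.environ.get('MARATHON_ADDRESSES', 'http://localhost:8080').split(',')


def _is_deployed(cli, app_name):
    # Treat the app as deployed if Marathon knows about it at all.
    try:
        cli.get_app(app_name)
        return True
    except NotFoundError:
        return False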
def do_full_rollback(client: MarathonClient, rollback: list):
    print('------------------\nPerforming rollback in order:')
    print('\n'.join(rollback))
    print('------------------')
    for each in rollback:
        if os.path.isfile(each):
            with open(each) as json_file:
                app = MarathonApp.from_json(json.load(json_file))
                _update_application(client, app, each, False)
        else:
            deployment = client.delete_app(each, True)
            wait_for_deployment(client, deployment)
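# A hedged sketch of the wait_for_deployment() helper used above; the real
# implementation is not shown, so this polling loop is an assumption. It
# relies on delete_app()/update_app() returning a dict with a 'deploymentId',
# which the other examples in this collection also rely on.
import time


def wait_for_deployment(client, deployment, poll_interval=1):
    deployment_id = deployment.get('deploymentId')
    # Wait until the deployment no longer shows up in list_deployments().
    while any(d.id == deployment_id for d in client.list_deployments()):
        time.sleep(poll_interval)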
def send_to_marathon(request): try: if request.method == "POST": action = request.POST.get("action", None) app_id = request.POST.get("id", None) mc = MarathonClient("http://{}:{}".format(settings.MARATHON["host"], settings.MARATHON["port"])) if action == "stop": mc.scale_app(app_id, 0) elif action == "start": mc.scale_app(app_id, 1) elif action == "destroy": mc.delete_app(app_id) elif action == "restart": pass elif action == "scale": mc.scale_app(app_id, int(request.POST.get("number_instance"))) result = '{"status":"success", "msg": "%(action)s success"}' % {"action": action} except Exception as e: result = '{"status":"error", "msg": "%(action)s fail: %(error)s" }' % { "action": action, "error": html.escape(str(e)), } return HttpResponse(result)
class TestCreateApp(unittest.TestCase):
    """
    Test the creation of a Marathon app against a live endpoint.
    Configure MARATHON_SERVER in tests.config.
    """

    def setUp(self):
        self._app = get_app()  # Generate a random server configuration.
        self.client = MarathonClient(MARATHON_SERVER)
        self.client.create_app(app_id=self._app.id, app=self._app)
        time.sleep(2)  # Wait two seconds for the POST to be processed by Marathon.
        self.app = self.client.get_app(self._app.id)
        while not self.app.tasks_healthy:  # Wait until the app becomes healthy.
            self.app = self.client.get_app(self._app.id)
            time.sleep(1)

    def test_create(self):
        self.assertIsInstance(self.app, MarathonApp)
        self.assertIsInstance(self.app.upgrade_strategy, MarathonUpgradeStrategy)
        self.assertIsInstance(self.app.tasks.pop(), MarathonTask)
        self.assertIsInstance(self.app.health_checks.pop(), MarathonHealthCheck)

    def tearDown(self):
        self.client.delete_app(self.app.id, force=True)
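# A hedged sketch of the get_app() fixture used in setUp(); the real fixture
# lives elsewhere in the test suite, so every value below is illustrative. It
# includes a health check so that tasks_healthy and health_checks assertions
# in test_create() have something to observe.
import uuid
from marathon.models import MarathonApp, MarathonHealthCheck


def get_app():
    return MarathonApp(
        id='/test-%s' % uuid.uuid4(),
        cmd='python3 -m http.server $PORT0',
        cpus=0.1,
        mem=32,
        instances=1,
        health_checks=[MarathonHealthCheck(protocol='TCP', port_index=0,
                                           grace_period_seconds=30,
                                           interval_seconds=10,
                                           timeout_seconds=10,
                                           max_consecutive_failures=3)],
    )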
class Services(object):

    def __init__(self, endpoints):
        self.marathon = MarathonClient(endpoints)

    def list(self):
        return self.marathon.list_apps()

    def clean(self, pattern=None):
        apps = self.list()
        for app in apps:
            try:
                if pattern == None or re.match(pattern, app.id) != None:
                    logging.info("Deleting app: %s", app.id)
                    self.marathon.delete_app(app.id, force=True)
                else:
                    logging.info("Ignoring app %s. Did not match pattern %s", app.id, pattern)
            except:
                logger.info("Unable to delete app %s", app.id)
                traceback.print_exc()

    def register_services(self, service_registry="conf/marathon"):
        for app_def in glob.glob(os.path.join(service_registry, "*json")):
            with open(app_def, "r") as stream:
                args = json.loads(stream.read())
                app_id = args['id']
                args = Names.snake_case(args)
                logger.debug("Creating service: %s", json.dumps(args, indent=2))
                args['tasks'] = []
                app = MarathonApp(**args)
                try:
                    logging.info("Creating app [id=>{0}]".format(app_id))
                    self.marathon.create_app(app_id, app)
                except:
                    traceback.print_exc()
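# A brief usage sketch for the Services wrapper above; the endpoint, the
# directory of JSON app definitions and the cleanup pattern are assumptions.
services = Services('http://localhost:8080')
services.register_services(service_registry='conf/marathon')  # POST each *.json app definition
print([app.id for app in services.list()])
services.clean(pattern='^/myteam/')  # force-delete only apps whose id matches the pattern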
class HealthCheckBencher(object):
    def __init__(self, marathon_url, image, tasks):
        self.concurrency = 20
        self.docker_image = image
        self.app_base_name = 'health-check-test-'
        self.total_tasks_cout = int(tasks)
        self.instances_per_app = 50
        if tasks < self.instances_per_app:
            self.instances_per_app = self.total_tasks_cout
            self.app_count = 1
        else:
            self.app_count = self.total_tasks_cout/self.instances_per_app
        self.heath_check_interval = 30
        self.test_duration = 20
        self.marathon_cluster = MarathonClient(marathon_url, timeout=240)
        self.work_queue = Queue()
        self.result_queue = Queue()
        self.app_list_queue = Queue()
        self.action_list = [self.start_collect,
                            'sleep={}'.format(self.test_duration),
                            self.get_stats]

    def remove_apps(self):
        apps = self.marathon_cluster.list_apps()
        for app in apps:
            if app.id.startswith("/"+self.app_base_name):
                self.marathon_cluster.delete_app(app.id)
        active = 0
        while True:
            apps = self.marathon_cluster.list_apps()
            for app in apps:
                if app.id.startswith(self.app_base_name):
                    active += 1
            if active == 0:
                break

    def create_app(self, id):
        port_mapping = MarathonContainerPortMapping(container_port=80,
                                                    protocol="tcp")
        app_docker = MarathonDockerContainer(
            image=self.docker_image,
            network="BRIDGE",
            force_pull_image=True,
            port_mappings=[port_mapping])
        app_container = MarathonContainer(docker=app_docker)
        http_health_check = MarathonHealthCheck(
            protocol="HTTP",
            path="/status",
            grace_period_seconds=300,
            interval_seconds=self.heath_check_interval,
            timeout_seconds=20,
            max_consecutive_failures=0
        )
        app_suffix = str(md5(str(random())).hexdigest())
        app_name = self.app_base_name + app_suffix
        new_app = MarathonApp(cpus=CPUS, mem=MEM, disk=DISK,
                              container=app_container,
                              health_checks=[http_health_check],
                              instances=self.instances_per_app,
                              max_launch_delay_seconds=5)
        print("Creating {}".format(app_name))
        self.marathon_cluster.create_app(app_id=app_name, app=new_app)
        self.app_list_queue.put(app_name)
        return None

    def wait_instances(self, app_name):
        health_ok = 0
        while health_ok < self.instances_per_app:
            health_ok = 0
            tasks = self.marathon_cluster.list_tasks(app_name)
            for task in tasks:
                if task.health_check_results:
                    health_ok += 1

    def start_collect(self, task):
        url = 'http://'+task['host']+':'+str(task['port'])+'/start_collect'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': collecter was started')
        else:
            print(task['id']+': failed to start collecter')

    def stop_collect(self, task):
        url = 'http://'+task['host']+':'+str(task['port'])+'/stop_collect'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': collecter was stopped')
        else:
            print(task['id']+': failed to stop collecter')

    def clear_stats(self, task):
        url = 'http://'+task['host']+':'+str(task['port'])+'/clear_stats'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': stats was dropped')
        else:
            print(task['id']+': stats was dropped')

    def get_stats(self, task):
        url = 'http://'+task['host']+':'+str(task['port'])+'/get_timestamps'
        try:
            res = urlopen(url)
        except Exception:
            print("URL req failed")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Failed',
                                   'data': []})
            return
        if res.getcode() == 200:
            data = res.read()
            timestamps = data.split(',')
            self.result_queue.put({'id': task['id'],
                                   'status': 'ok',
                                   'data': timestamps})
        elif res.getcode() == 202:
            print("Collecting is not enabled")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Collecting is not enabled',
                                   'data': []})
        else:
            print("Unknown response code")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Unknown response code',
                                   'data': []})

    def repeat(self, action):
        while self.work_queue.empty() is False:
            try:
                iteration = self.work_queue.get_nowait()
            except Empty:
                continue
            action(iteration)
            self.work_queue.task_done()

    def fill_queue(self, iterations):
        for iteration in iterations:
            self.work_queue.put(iteration)

    def get_tasks(self):
        res = []
        tasks = self.marathon_cluster.list_tasks()
        for task in tasks:
            if not task.id.startswith('health-check-test-'):
                continue
            res.append({'id': str(task.id),
                        'host': str(task.host),
                        'port': str(task.ports[0])})
        return res

    def create_apps(self):
        self.fill_queue(range(self.app_count))
        for thread_num in range(self.concurrency):
            if self.work_queue.empty() is True:
                break
            worker = Thread(target=self.repeat, args=(self.create_app,))
            worker.start()
        self.work_queue.join()
        while self.app_list_queue.empty() is False:
            try:
                app_name = self.app_list_queue.get_nowait()
            except Empty:
                continue
            self.work_queue.put(app_name)
        for thread_num in range(self.concurrency):
            if self.work_queue.empty() is True:
                break
            worker = Thread(target=self.repeat, args=(self.wait_instances,))
            worker.start()
        self.work_queue.join()

    def start_test(self):
        task_list = self.get_tasks()
        for action in self.action_list:
            if isinstance(action, six.text_type):
                if action.startswith('sleep='):
                    amount = int(action.split('=')[1])
                    sleep(60*amount)
                    continue
            self.fill_queue(task_list)
            for thread_num in range(self.concurrency):
                if self.work_queue.empty() is True:
                    break
                worker = Thread(target=self.repeat, args=(action,))
                worker.start()
            self.work_queue.join()

    def generate_report(self):
        today = datetime.today()
        file_prefix = "{:%Y-%m-%d_%H_%M_%S-}".format(today)
        file_name = (file_prefix + 'health_check_result-' +
                     str(self.total_tasks_cout) + 'tasks.csv')
        f = open(file_name, "w")
        f.write("Task ID,Health check timestamp")
        while self.result_queue.empty() is False:
            try:
                result = self.result_queue.get_nowait()
            except Empty:
                continue
            for timestamp in result['data']:
                f.write("\n%s,%s" % (result['id'], timestamp))
        f.close()
class MarathonSpawner(Spawner): app_image = Unicode("jupyterhub/singleuser", config=True) app_prefix = Unicode( "jupyter", help=dedent( """ Prefix for app names. The full app name for a particular user will be <prefix>/<username>. """ ) ).tag(config=True) marathon_host = Unicode( u'', help="Hostname of Marathon server").tag(config=True) marathon_constraints = List( [], help='Constraints to be passed through to Marathon').tag(config=True) ports = List( [8888], help='Ports to expose externally' ).tag(config=True) volumes = List( [], help=dedent( """ A list in Marathon REST API format for mounting volumes into the docker container. [ { "containerPath": "/foo", "hostPath": "/bar", "mode": "RW" } ] Note that using the template variable {username} in containerPath, hostPath or the name variable in case it's an external drive it will be replaced with the current user's name. """ ) ).tag(config=True) network_mode = Unicode( 'BRIDGE', help="Enum of BRIDGE or HOST" ).tag(config=True) hub_ip_connect = Unicode( "", help="Public IP address of the hub" ).tag(config=True) hub_port_connect = Integer( -1, help="Public PORT of the hub" ).tag(config=True) format_volume_name = Any( help="""Any callable that accepts a string template and a Spawner instance as parameters in that order and returns a string. """ ).tag(config=True) @default('format_volume_name') def _get_default_format_volume_name(self): return default_format_volume_name _executor = None @property def executor(self): cls = self.__class__ if cls._executor is None: cls._executor = ThreadPoolExecutor(1) return cls._executor def __init__(self, *args, **kwargs): super(MarathonSpawner, self).__init__(*args, **kwargs) self.marathon = MarathonClient(self.marathon_host) @property def container_name(self): return '/%s/%s' % (self.app_prefix, self.user.name) def get_state(self): state = super(MarathonSpawner, self).get_state() state['container_name'] = self.container_name return state def load_state(self, state): if 'container_name' in state: pass def get_health_checks(self): health_checks = [] health_checks.append(MarathonHealthCheck( protocol='TCP', port_index=0, grace_period_seconds=300, interval_seconds=60, timeout_seconds=20, max_consecutive_failures=0 )) return health_checks def get_volumes(self): volumes = [] for v in self.volumes: mv = MarathonContainerVolume.from_json(v) mv.container_path = self.format_volume_name(mv.container_path, self) mv.host_path = self.format_volume_name(mv.host_path, self) if mv.external and 'name' in mv.external: mv.external['name'] = self.format_volume_name(mv.external['name'], self) volumes.append(mv) return volumes def get_port_mappings(self): port_mappings = [] for p in self.ports: port_mappings.append( MarathonContainerPortMapping( container_port=p, host_port=0, protocol='tcp' ) ) return port_mappings def get_constraints(self): constraints = [] for c in self.marathon_constraints: constraints.append(MarathonConstraint.from_json(c)) @run_on_executor def get_deployment(self, deployment_id): deployments = self.marathon.list_deployments() for d in deployments: if d.id == deployment_id: return d return None @run_on_executor def get_deployment_for_app(self, app_name): deployments = self.marathon.list_deployments() for d in deployments: if app_name in d.affected_apps: return d return None def get_ip_and_port(self, app_info): assert len(app_info.tasks) == 1 ip = socket.gethostbyname(app_info.tasks[0].host) return (ip, app_info.tasks[0].ports[0]) @run_on_executor def get_app_info(self, app_name): try: app = 
self.marathon.get_app(app_name, embed_tasks=True) except NotFoundError: self.log.info("The %s application has not been started yet", app_name) return None else: return app def _public_hub_api_url(self): uri = urlparse(self.hub.api_url) port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname return urlunparse(( uri.scheme, '%s:%s' % (ip, port), uri.path, uri.params, uri.query, uri.fragment )) def get_env(self): env = super(MarathonSpawner, self).get_env() env.update(dict( # Jupyter Hub config JPY_USER=self.user.name, JPY_COOKIE_NAME=self.user.server.cookie_name, JPY_BASE_URL=self.user.server.base_url, JPY_HUB_PREFIX=self.hub.server.base_url, )) if self.notebook_dir: env['NOTEBOOK_DIR'] = self.notebook_dir if self.hub_ip_connect or self.hub_port_connect > 0: hub_api_url = self._public_hub_api_url() else: hub_api_url = self.hub.api_url env['JPY_HUB_API_URL'] = hub_api_url return env @gen.coroutine def start(self): docker_container = MarathonDockerContainer( image=self.app_image, network=self.network_mode, port_mappings=self.get_port_mappings()) app_container = MarathonContainer( docker=docker_container, type='DOCKER', volumes=self.get_volumes()) # the memory request in marathon is in MiB if hasattr(self, 'mem_limit') and self.mem_limit is not None: mem_request = self.mem_limit / 1024.0 / 1024.0 else: mem_request = 1024.0 app_request = MarathonApp( id=self.container_name, env=self.get_env(), cpus=self.cpu_limit, mem=mem_request, container=app_container, constraints=self.get_constraints(), health_checks=self.get_health_checks(), instances=1 ) app = self.marathon.create_app(self.container_name, app_request) if app is False or app.deployments is None: self.log.error("Failed to create application for %s", self.container_name) return None while True: app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: ip, port = self.get_ip_and_port(app_info) break yield gen.sleep(1) return (ip, port) @gen.coroutine def stop(self, now=False): try: status = self.marathon.delete_app(self.container_name) except: self.log.error("Could not delete application %s", self.container_name) raise else: if not now: while True: deployment = yield self.get_deployment(status['deploymentId']) if deployment is None: break yield gen.sleep(1) @gen.coroutine def poll(self): deployment = yield self.get_deployment_for_app(self.container_name) if deployment: for current_action in deployment.current_actions: if current_action.action == 'StopApplication': self.log.error("Application %s is shutting down", self.container_name) return 1 return None app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: return None return 0
class MarathonHTTPClient(object):

    def __init__(self, target, auth, options, pkey):
        self.target = settings.MARATHON_HOST
        self.auth = auth
        self.options = options
        self.pkey = pkey
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://' + self.target + ':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options, pkey)

    # helpers

    def _app_id(self, name):
        return name.replace('_', '.')

    # container api

    def create(self, name, image, command='', **kwargs):
        """Create a container"""
        app_id = self._app_id(name)
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        image = self.registry + '/' + image
        mems = kwargs.get('memory', {}).get(l['c_type'])
        m = 0
        if mems:
            mems = mems.lower()
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get('cpu', {}).get(l['c_type'])
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(**locals())
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=m, cpus=c))
        self.client.scale_app(app_id, 0, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container"""
        self.client.scale_app(self._app_id(name), 1, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(self._app_id(name)).tasks[0].host
        self._waitforcontainer(host, name)

    def stop(self, name):
        """Stop a container"""
        raise NotImplementedError

    def destroy(self, name):
        """Destroy a container"""
        try:
            host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(host, name)
        except:
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command"""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        try:
            for _ in xrange(POLL_ATTEMPTS):
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        raise NotImplementedError
def unlaunch_app(app_id):
    marathon_client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    marathon_client.delete_app(app_id)
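# A hedged companion sketch: the same teardown, but tolerating apps that are
# already gone and forcing removal through any in-flight deployment. The
# module-level marathon_host/marathon_port values are assumed to exist, as in
# unlaunch_app() above.
from marathon.exceptions import NotFoundError


def unlaunch_app_if_present(app_id):
    marathon_client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    try:
        marathon_client.delete_app(app_id, force=True)
    except NotFoundError:
        pass  # nothing to remove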
class MarathonIF(object):

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id):
        try:
            a = self.mcli.get_app(app_id)
        except marathon.exceptions.NotFoundError as e:  # NOQA
            return None
        return a

    def delete_app(self, app_id, force=False):
        return self.mcli.delete_app(app_id, force)

    def delete_deployment(self, dep_id):
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        raise

    @staticmethod
    def is_valid_app_id(app_id):
        # allowed: lowercase letters, digits, hyphens, slash, dot
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """
        Create and start an app.
        :param app_id: (str) - Application ID
        :param attr: marathon.models.app.MarathonApp application to create.
        :return: the created app
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error("Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    time.sleep(1)
                else:
                    raise
        raise

    def wait_app_removal(self, app):
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
        if cnt > 0:
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count):
        cnt = 0
        while True:
            a1 = self.get_app(app)
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(1)
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale):
        return self.mcli.scale_app(app, scale)

    def ping(self):
        return self.mcli.ping()
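# A hedged usage sketch for MarathonIF above; the Marathon address, peer
# address and app definition are illustrative assumptions.
from marathon.models import MarathonApp

mif = MarathonIF('http://localhost:8080', my_addr='10.0.0.5', mesos=None)
mif.delete_app_ifexisting('/probe/httpd')
mif.create_app('/probe/httpd', MarathonApp(cmd='python3 -m http.server $PORT0',
                                           cpus=0.1, mem=32, instances=2))
app = mif.wait_app_ready('/probe/httpd', running_count=2)
mif.scale_app('/probe/httpd', 1)
mif.delete_app('/probe/httpd', force=True)
mif.wait_app_removal('/probe/httpd')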
class MarathonHTTPClient(AbstractSchedulerClient):

    def __init__(self, target, auth, options, pkey):
        super(MarathonHTTPClient, self).__init__(target, auth, options, pkey)
        self.target = settings.MARATHON_HOST
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://' + self.target + ':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options, pkey)

    # helpers

    def _app_id(self, name):
        return name.replace('_', '.')

    # container api

    def create(self, name, image, command='', **kwargs):
        """Create a new container"""
        app_id = self._app_id(name)
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        image = self.registry + '/' + image
        mems = kwargs.get('memory', {}).get(l['c_type'])
        m = 0
        if mems:
            mems = mems.lower()
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get('cpu', {}).get(l['c_type'])
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(**locals())
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=m, cpus=c, instances=0))
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container."""
        self.client.scale_app(self._app_id(name), 1, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(self._app_id(name)).tasks[0].host
        self._waitforcontainer(host, name)

    def destroy(self, name):
        """Destroy a container."""
        try:
            host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(host, name)
        except:
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Display the given job's running state."""
        try:
            for _ in xrange(POLL_ATTEMPTS):
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed
parser.add_argument("-e", "--execute", help="Operation execute", choices=['delete', 'create'], required=True) parser.add_argument("-d", "--delete", help="Delete all applications", action="store_true") parser.add_argument("-c", "--concurrency", help="Concurrency") parser.add_argument("-n", "--nodes", help="Number of tasks per application") parser.add_argument("-s", "--silent", help="Print only results", action="store_true") args = parser.parse_args() cluster = MarathonClient(args.marathon, timeout=240) if args.execute == "delete": cluster = MarathonClient(args.marathon) all_apps = cluster.list_apps() for app in all_apps: print("Delete {}".format(app.id)) cluster.delete_app(app.id, force=True) if args.execute == "create": concur = 1 if args.concurrency is None else args.concurrency nodes = 1 if args.nodes is None else args.nodes concur_create_apps(int(concur), int(nodes))
class MarathonSpawner(Spawner): app_image = Unicode("jupyterhub/singleuser:%s" % _jupyterhub_xy, config=True) app_prefix = Unicode("jupyter", help=dedent(""" Prefix for app names. The full app name for a particular user will be <prefix>/<username>. """)).tag(config=True) marathon_host = Unicode( u'', help="Hostname of Marathon server").tag(config=True) marathon_constraints = List( [], help='Constraints to be passed through to Marathon').tag(config=True) ports = List([8888], help='Ports to expose externally').tag(config=True) volumes = List([], help=dedent(""" A list in Marathon REST API format for mounting volumes into the docker container. [ { "containerPath": "/foo", "hostPath": "/bar", "mode": "RW" } ] Note that using the template variable {username} in containerPath, hostPath or the name variable in case it's an external drive it will be replaced with the current user's name. """)).tag(config=True) network_mode = Unicode('BRIDGE', help="Enum of BRIDGE or HOST").tag(config=True) hub_ip_connect = Unicode( "", help="Public IP address of the hub").tag(config=True) @observe('hub_ip_connect') def _ip_connect_changed(self, change): if jupyterhub.version_info >= (0, 8): warnings.warn( "MarathonSpawner.hub_ip_connect is no longer needed with JupyterHub 0.8." " Use JupyterHub.hub_connect_ip instead.", DeprecationWarning, ) hub_port_connect = Integer(-1, help="Public PORT of the hub").tag(config=True) @observe('hub_port_connect') def _port_connect_changed(self, change): if jupyterhub.version_info >= (0, 8): warnings.warn( "MarathonSpawner.hub_port_connect is no longer needed with JupyterHub 0.8." " Use JupyterHub.hub_connect_port instead.", DeprecationWarning, ) format_volume_name = Any( help="""Any callable that accepts a string template and a Spawner instance as parameters in that order and returns a string. 
""").tag(config=True) @default('format_volume_name') def _get_default_format_volume_name(self): return default_format_volume_name # fix default port to 8888, used in the container @default('port') def _port_default(self): return 8888 # default to listening on all-interfaces in the container @default('ip') def _ip_default(self): return '0.0.0.0' _executor = None @property def executor(self): cls = self.__class__ if cls._executor is None: cls._executor = ThreadPoolExecutor(1) return cls._executor def __init__(self, *args, **kwargs): super(MarathonSpawner, self).__init__(*args, **kwargs) self.marathon = MarathonClient(self.marathon_host) @property def container_name(self): return '/%s/%s' % (self.app_prefix, self.user.name) def get_state(self): state = super(MarathonSpawner, self).get_state() state['container_name'] = self.container_name return state def load_state(self, state): if 'container_name' in state: pass def get_health_checks(self): health_checks = [] health_checks.append( MarathonHealthCheck(protocol='TCP', port_index=0, grace_period_seconds=300, interval_seconds=30, timeout_seconds=20, max_consecutive_failures=0)) return health_checks def get_volumes(self): volumes = [] for v in self.volumes: mv = MarathonContainerVolume.from_json(v) mv.container_path = self.format_volume_name( mv.container_path, self) mv.host_path = self.format_volume_name(mv.host_path, self) if mv.external and 'name' in mv.external: mv.external['name'] = self.format_volume_name( mv.external['name'], self) volumes.append(mv) return volumes def get_port_mappings(self): port_mappings = [] for p in self.ports: port_mappings.append( MarathonContainerPortMapping(container_port=p, host_port=0, protocol='tcp')) return port_mappings def get_constraints(self): constraints = [] for c in self.marathon_constraints: constraints.append(MarathonConstraint.from_json(c)) return constraints @run_on_executor def get_deployment(self, deployment_id): deployments = self.marathon.list_deployments() for d in deployments: if d.id == deployment_id: return d return None @run_on_executor def get_deployment_for_app(self, app_name): deployments = self.marathon.list_deployments() for d in deployments: if app_name in d.affected_apps: return d return None def get_ip_and_port(self, app_info): assert len(app_info.tasks) == 1 ip = socket.gethostbyname(app_info.tasks[0].host) return (ip, app_info.tasks[0].ports[0]) @run_on_executor def get_app_info(self, app_name): try: app = self.marathon.get_app(app_name, embed_tasks=True) except NotFoundError: self.log.info("The %s application has not been started yet", app_name) return None else: return app def _public_hub_api_url(self): uri = urlparse(self.hub.api_url) port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname return urlunparse((uri.scheme, '%s:%s' % (ip, port), uri.path, uri.params, uri.query, uri.fragment)) def get_args(self): args = super().get_args() if self.hub_ip_connect: # JupyterHub 0.7 specifies --hub-api-url # on the command-line, which is hard to update for idx, arg in enumerate(list(args)): if arg.startswith('--hub-api-url='): args.pop(idx) break args.append('--hub-api-url=%s' % self._public_hub_api_url()) return args @gen.coroutine def start(self): docker_container = MarathonDockerContainer( image=self.app_image, network=self.network_mode, port_mappings=self.get_port_mappings()) app_container = MarathonContainer(docker=docker_container, type='DOCKER', volumes=self.get_volumes()) # the memory request in 
marathon is in MiB if hasattr(self, 'mem_limit') and self.mem_limit is not None: mem_request = self.mem_limit / 1024.0 / 1024.0 else: mem_request = 1024.0 cmd = self.cmd + self.get_args() app_request = MarathonApp(id=self.container_name, cmd=' '.join(cmd), env=self.get_env(), cpus=self.cpu_limit, mem=mem_request, container=app_container, constraints=self.get_constraints(), health_checks=self.get_health_checks(), instances=1, accepted_resource_roles=['*']) self.log.info("Creating App: %s", app_request) self.log.info("self.marathon: %s", self.marathon) app = self.marathon.create_app(self.container_name, app_request) if app is False or app.deployments is None: self.log.error("Failed to create application for %s", self.container_name) self.log.error("app: %s", app) return None while True: app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: ip, port = self.get_ip_and_port(app_info) break yield gen.sleep(1) return (ip, port) @gen.coroutine def stop(self, now=False): try: status = self.marathon.delete_app(self.container_name) except: self.log.error("Could not delete application %s", self.container_name) raise else: if not now: while True: deployment = yield self.get_deployment( status['deploymentId']) if deployment is None: break yield gen.sleep(1) @gen.coroutine def poll(self): deployment = yield self.get_deployment_for_app(self.container_name) if deployment: for current_action in deployment.current_actions: if current_action.action == 'StopApplication': self.log.error("Application %s is shutting down", self.container_name) return 1 return None app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: return None return 0
import sys
import time
from optparse import OptionParser

from marathon import MarathonClient
from marathon.models import MarathonApp

if __name__ == '__main__':
    usage = ('python %prog')
    parser = OptionParser(description='Simple marathon-python based master to launch apps',
                          version="0.1 ", usage=usage)
    (options, args) = parser.parse_args()
    if (len(args) != 0):
        parser.print_help()
        sys.exit(2)

    print "Initiating marathonclient..."
    c = MarathonClient('http://localhost:8080')

    app_cmd = "python /home/abdullah/cosmic-space/test-mesos/py-zmq/sub_client.py --server_ip_ports 10.10.0.2:5556"

    # launch app
    print "Initiating zmq-client app"
    c.create_app('zmq-client', MarathonApp(cmd=app_cmd, mem=16, cpus=0.01))

    # scale
    raw_input("scale_apps upto 400")
    c.scale_app('zmq-client', instances=400)

    # delete
    raw_input("delete apps")
    c.delete_app('zmq-client')
class MarathonWorkers(object):
    def __init__(self, scheduler, marathon, name=None, nprocs=1, nthreads=0,
                 docker='daskos/daskathon', volumes=[], **kwargs):
        self.scheduler = scheduler
        self.executor = ThreadPoolExecutor(1)
        self.client = MarathonClient(marathon)
        self.name = name or 'dask-%s' % uuid.uuid4()
        self.docker = docker
        self.volumes = volumes
        self.nprocs = nprocs
        self.nthreads = nthreads
        self.options = kwargs

    def start(self, nworkers=0):
        # address = self.scheduler.address.replace('tcp://', '')
        args = ['dask-worker', self.scheduler.address,
                '--name', '$MESOS_TASK_ID',  # use Mesos task ID as worker name
                '--worker-port', '$PORT_WORKER',
                '--bokeh-port', '$PORT_BOKEH',
                '--nanny-port', '$PORT_NANNY',
                '--nprocs', str(self.nprocs),
                '--nthreads', str(self.nthreads)]

        ports = [{'port': 0, 'protocol': 'tcp', 'name': name}
                 for name in ['worker', 'nanny', 'http', 'bokeh']]

        # healths = [{'portIndex': i,
        #             'protocol': 'TCP',
        #             'gracePeriodSeconds': 300,
        #             'intervalSeconds': 60,
        #             'timeoutSeconds': 20,
        #             'maxConsecutiveFailures': 3}
        #            for i, name in enumerate(['worker', 'nanny', 'http', 'bokeh'])]
        healths = []

        if 'mem' in self.options:
            args.extend(['--memory-limit',
                         str(int(self.options['mem'] * 0.8 * 1e6))])

        docker_parameters = [{"key": "volume", "value": v} for v in self.volumes]
        container = MarathonContainer({'image': self.docker,
                                       'forcePullImage': True,
                                       'parameters': docker_parameters})
        command = ' '.join(args)
        app = MarathonApp(instances=nworkers, container=container,
                          port_definitions=ports, cmd=command,
                          health_checks=healths, **self.options)
        self.client.update_app(self.name, app)
        logger.info('Started marathon workers {}'.format(self.name))

    def close(self):
        logger.info('Stopping marathon workers {}'.format(self.name))
        self.client.delete_app(self.name, force=True)

    def scale_up(self, n):
        self.executor.submit(self.client.scale_app, self.name, instances=n)

    def scale_down(self, workers):
        for worker in workers:
            self.executor.submit(self.client.kill_task, self.name,
                                 self.scheduler.worker_info[worker]['name'],
                                 scale=True)
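# A hedged usage sketch for MarathonWorkers above. get_scheduler() is a
# hypothetical helper returning a running dask distributed Scheduler; the
# Marathon endpoint, volume spec and resource figures are illustrative
# assumptions.
scheduler = get_scheduler()

workers = MarathonWorkers(scheduler, 'http://localhost:8080',
                          name='dask-workers', nprocs=2, nthreads=1,
                          volumes=['/data:/data:ro'],
                          cpus=1, mem=1024)
workers.start(nworkers=3)   # create or update the Marathon app with three instances
workers.scale_up(6)
workers.close()             # force-delete the app when finished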
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--marathon",
                    help="Marathon URL, e.g. http://127.0.0.1:8080/marathon",
                    required=True)
parser.add_argument("-e", "--execute", help="Operation execute",
                    choices=['delete', 'create'], required=True)
parser.add_argument("-d", "--delete", help="Delete all applications",
                    action="store_true")
parser.add_argument("-c", "--concurrency", help="Concurrency")
parser.add_argument("-n", "--nodes", help="Number of tasks per application")
parser.add_argument("-s", "--silent", help="Print only results",
                    action="store_true")
args = parser.parse_args()

cluster = MarathonClient(args.marathon, timeout=240)

if args.execute == "delete":
    cluster = MarathonClient(args.marathon)
    all_apps = cluster.list_apps()
    for app in all_apps:
        print("Delete {}".format(app.id))
        cluster.delete_app(app.id, force=True)

if args.execute == "create":
    concur = 1 if args.concurrency is None else args.concurrency
    nodes = 1 if args.nodes is None else args.nodes
    concur_create_apps(int(concur), int(nodes))
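# A hedged sketch of the concur_create_apps() helper called above; the real
# implementation is not shown, so the app template, naming and threading
# layout here are assumptions. It is shown after the CLI block for
# readability; in the original module it would be defined before use.
from threading import Thread
from marathon.models import MarathonApp


def concur_create_apps(concurrency, nodes):
    def create_one(index):
        app_id = 'scale-test-{}'.format(index)
        app = MarathonApp(cmd='python3 -m http.server $PORT0',
                          cpus=0.1, mem=32, instances=nodes)
        print("Create {} with {} tasks".format(app_id, nodes))
        cluster.create_app(app_id, app)  # uses the module-level client above

    threads = [Thread(target=create_one, args=(i,)) for i in range(concurrency)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()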
class MarathonSpawner(Spawner):
    # Load the app image
    app_image = Unicode("jupyterhub/singleuser", config=True)

    # The command to run
    app_cmd = Unicode("jupyter notebook", config=True)

    # This is the prefix in Marathon
    app_prefix = Unicode(
        "jupyter",
        help=dedent(
            """
            Prefix for app names. The full app name for a particular user will be
            <prefix>/<username>.
            """
        )
    ).tag(config=True)

    user_web_port = Integer(0, help="Port that the Notebook is listening on").tag(config=True)
    user_ssh_port = Integer(0, help="SSH port that the container is listening on").tag(config=True)
    user_ssh_host = Unicode('', help="Hostname of the ssh container").tag(config=True)
    use_jupyterlab = Integer(0, help="Use JupyterLab: 1 for JupyterLab, 0 (default) for the classic Notebook").tag(config=True)
    user_ssh_hagroup = Unicode('', help="HAProxy group for ssh container port").tag(config=True)

    # zeta_user_file lists the users and their custom settings for installation in a Zeta
    # Architecture. If this is blank, the JupyterHub defaults are used for memory, CPU,
    # ports, and image. If it is not blank, per-user settings are read from that file.
    zeta_user_file = Unicode(
        "",
        help="Path to json file that includes users and per user settings"
    ).tag(config=True)

    no_user_file_fail = Bool(
        True,
        help="If zeta_user_file is provided but can't be opened, fail (default). False loads defaults and tries to spawn"
    ).tag(config=True)

    # Marathon server
    marathon_host = Unicode(
        u'',
        help="Hostname of Marathon server").tag(config=True)

    marathon_user_name = Unicode(
        u'',
        help='Marathon user name'
    ).tag(config=True)

    marathon_user_password = Unicode(
        u'',
        help='Marathon user password'
    ).tag(config=True)

    fetch = List([], help='Optional files to fetch').tag(config=True)

    custom_env = List(
        [],
        help='Additional ENVs to add to the default. Format is a list of 1-record dictionaries: [{key: val}]'
    ).tag(config=True)

    # Constraints in Marathon
    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    # Shared Notebook location
    shared_notebook_dir = Unicode(
        '',
        help="Shared Notebook location that users will get a link to in their notebook location - can be blank"
    ).tag(config=True)

    ports = List(
        [8888],
        help='Ports to expose externally'
    ).tag(config=True)

    volumes = List(
        [],
        help=dedent(
            """
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]
            The template variable {username} in containerPath, hostPath, or in the name
            variable (for external volumes) will be replaced with the current user's name.
            """
        )
    ).tag(config=True)

    network_mode = Unicode(
        'BRIDGE',
        help="Enum of BRIDGE or HOST"
    ).tag(config=True)

    hub_ip_connect = Unicode(
        "",
        help="Public IP address of the hub"
    ).tag(config=True)

    hub_port_connect = Integer(
        -1,
        help="Public PORT of the hub"
    ).tag(config=True)

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """
    ).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    _executor = None

    @property
    def executor(self):
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(
            self.marathon_host, self.marathon_user_name, self.marathon_user_password)

    @property
    def container_name(self):
        self.log.info("Container Name : %s / %s / %s", self.app_prefix, self.user.name, self.name)
        try:
            self.log.info("Debug %s", json.dumps(self.name))
        except Exception:
            self.log.info("Could not log self")
        return '/%s/%s%s' % (self.app_prefix, self.user.name, self.name)

    def get_state(self):
        state = super(MarathonSpawner, self).get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        if 'container_name' in state:
            pass

    def get_health_checks(self):
        health_checks = []
        health_checks.append(MarathonHealthCheck(
            protocol='TCP',
            port_index=0,
            grace_period_seconds=300,
            interval_seconds=60,
            timeout_seconds=20,
            max_consecutive_failures=0
        ))
        return health_checks

    def get_volumes(self):
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(mv.external['name'], self)
            volumes.append(mv)
        # Remove duplicates: there should be only one mount per container path.
        out_vols = []
        dups = {}
        for x in volumes:
            if x.container_path in dups:
                pass
            else:
                out_vols.append(x)
                dups[x.container_path] = 1
        return out_vols

    def get_app_cmd(self):
        retval = self.app_cmd.replace("{username}", self.user.name)
        retval = retval.replace("{userwebport}", str(self.user_web_port))
        if self.use_jupyterlab == 1:
            print("This is where I should do some thing if I want to run Jupyter lab")
        if self.user_ssh_hagroup != "":
            retval = retval.replace("{usersshport}", "$PORT0")
        else:
            retval = retval.replace("{usersshport}", str(self.user_ssh_port))
        return retval

    def get_port_mappings(self):
        port_mappings = []
        if self.network_mode == "BRIDGE":
            for p in self.ports:
                port_mappings.append(
                    MarathonContainerPortMapping(
                        container_port=p,
                        host_port=0,
                        protocol='tcp'
                    )
                )
        return port_mappings

    def get_constraints(self):
        constraints = []
        for c in self.marathon_constraints:
            constraints.append(MarathonConstraint.from_json(c))
        return constraints

    @run_on_executor
    def get_deployment(self, deployment_id):
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        port = app_info.tasks[0].ports[0]
        return (ip, port)

    @run_on_executor
    def get_app_info(self, app_name):
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet", app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((
            uri.scheme,
            '%s:%s' % (ip, port),
            uri.path,
            uri.params,
            uri.query,
            uri.fragment
        ))

    def get_env(self):
        env = super(MarathonSpawner, self).get_env()
        env.update(dict(
            # Jupyter Hub config
            JPY_USER=self.user.name,
            # JPY_COOKIE_NAME=self.user.server.cookie_name,
            # JPY_BASE_URL=self.user.server.base_url,
            JPY_HUB_PREFIX=self.hub.server.base_url,
            JPY_USER_WEB_PORT=str(self.user_web_port),
            JPY_USER_SSH_PORT=str(self.user_ssh_port),
            JPY_USER_SSH_HOST=str(self.user_ssh_host)
        ))
        if self.notebook_dir:
            env['NOTEBOOK_DIR'] = self.notebook_dir
        if self.hub_ip_connect or self.hub_port_connect > 0:
            hub_api_url = self._public_hub_api_url()
        else:
            hub_api_url = self.hub.api_url
        env['JPY_HUB_API_URL'] = hub_api_url
        for x in self.custom_env:
            for k, v in x.items():
                env[k] = str(v)
        return env

    def update_users(self):
        # No changes if the zeta_user_file is blank
        if self.zeta_user_file != "":
            try:
                j = open(self.zeta_user_file, "r")
                user_file = j.read()
                j.close()
                user_ar = {}
                for x in user_file.split("\n"):
                    if x.strip().find("#") != 0 and x.strip() != "":
                        y = json.loads(x)
                        if y['user'] == self.user.name:
                            user_ar = y
                            break
                if len(user_ar) == 0:
                    self.log.error("Could not find current user %s in zeta_user_file %s - Not Spawning"
                                   % (self.user.name, self.zeta_user_file))
                    if self.no_user_file_fail == True:
                        raise Exception('no_user_file_fail is True, will not go on')
                print("User List identified and loaded, setting values to %s" % user_ar)
                self.cpu_limit = user_ar['cpu_limit']
                self.mem_limit = user_ar['mem_limit']
                self.user_ssh_port = user_ar['user_ssh_port']
                self.user_web_port = user_ar['user_web_port']
                self.user_ssh_host = user_ar['user_ssh_host']
                try:
                    self.user_ssh_hagroup = user_ar['user_ssh_hagroup']
                except:
                    self.user_ssh_hagroup = ""
                try:
                    self.use_jupyterlab = int(user_ar['use_jupyterlab'])
                except:
                    self.use_jupyterlab = 0
                self.network_mode = user_ar['network_mode']
                self.app_image = user_ar['app_image']
                self.marathon_constraints = user_ar['marathon_constraints']
                self.ports.append(self.user_web_port)
                self.ports.append(self.user_ssh_port)
                self.custom_env = self.custom_env + user_ar['custom_env']
                self.volumes = self.volumes + user_ar['volumes']
                print("User List Loaded!")
                # Example line in the user file:
                # { "user": "******", "cpu_limit": "1", "mem_limit": "2G", "user_ssh_port": 10500, "user_web_port": 10400, "network_mode": "BRIDGE", "app_image": "$APP_IMG", "marathon_constraints": []}
            except:
                self.log.error("Could not find or open zeta_user_file: %s" % self.zeta_user_file)
                if self.no_user_file_fail == True:
                    raise Exception("Could not open file and config says don't go on")

    @gen.coroutine
    def start(self):
        # First make a quick call to determine if user info was updated
        self.update_users()

        # Go on to start the notebook
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())

        app_container = MarathonContainer(
            docker=docker_container,
            type='DOCKER',
            volumes=self.get_volumes())

        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0

        if self.user_ssh_hagroup != "":
            myports = [self.user_ssh_port]
            labels = {"HAPROXY_GROUP": self.user_ssh_hagroup, "HA_EDGE_CONF": "1"}
        else:
            labels = {}
            myports = []

        app_request = MarathonApp(
            id=self.container_name,
            cmd=self.get_app_cmd(),
            env=self.get_env(),
            cpus=self.cpu_limit,
            mem=mem_request,
            container=app_container,
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            instances=1,
            labels=labels,
            ports=myports,
            fetch=self.fetch,
        )

        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s", self.container_name)
            return None

        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        try:
            status = self.marathon.delete_app(self.container_name)
        except Exception:
            self.log.error("Could not delete application %s", self.container_name)
            raise
        else:
            if not now:
                while True:
                    deployment = yield self.get_deployment(status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down", self.container_name)
                    return 1
            return None
        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
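Because the spawner above is driven entirely by traitlets, it can be configured from jupyterhub_config.py. The following is a minimal, hedged sketch: the trait names come from the class shown, but the import path, Marathon address, and volume paths are placeholders for illustration.

# jupyterhub_config.py -- illustrative sketch only; adjust the module path,
# Marathon address, image, and paths to your deployment.
c.JupyterHub.spawner_class = 'marathonspawner.MarathonSpawner'  # assumed import path

c.MarathonSpawner.marathon_host = 'http://marathon.example.com:8080'  # assumed URL
c.MarathonSpawner.app_image = 'jupyterhub/singleuser'
c.MarathonSpawner.app_prefix = 'jupyter'
c.MarathonSpawner.network_mode = 'BRIDGE'
c.MarathonSpawner.ports = [8888]
c.MarathonSpawner.marathon_constraints = [['hostname', 'UNIQUE']]
c.MarathonSpawner.volumes = [
    # {username} is expanded by the spawner's volume-name formatting.
    {'containerPath': '/home/{username}', 'hostPath': '/data/{username}', 'mode': 'RW'},
]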
class MarathonIF(object):
    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id, timeout=300):
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                try:
                    a = self.mcli.get_app(app_id)
                except marathon.exceptions.NotFoundError as e:  # NOQA
                    return None
                return a
            except Exception:
                l.info("mcli: get_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli get_app timed out, possible zookeeper/marathon/mesos malfunction")

    def delete_app(self, app_id, force=False, timeout=200):
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.delete_app(app_id, force)
                return
            except Exception:
                l.info("mcli: delete_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli delete_app timed out, possible zookeeper/marathon/mesos malfunction")

    def delete_deployment(self, dep_id):
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except Exception:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        raise

    @staticmethod
    def is_valid_app_id(app_id):
        # allowed: letters, digits, hyphens, slashes, dots
        # (the pattern also accepts uppercase, although the error below only mentions lowercase)
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """Create and start an app.

        :param app_id: (str) - Application ID
        :param attr: marathon.models.app.MarathonApp application to create.
        :return: the created app
        """
        # Validate that app_id conforms to the allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error("Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    time.sleep(1)
                else:
                    raise
        raise

    def wait_app_removal(self, app):
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
        if cnt > 0:
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count, sleep_before_next_try=1):
        cnt = 0
        while True:
            a1 = self.get_app(app)
            # If more tasks are running than requested (for whatever reason), scale down.
            if a1.tasks_running > running_count:
                delta = a1.tasks_running - running_count
                l.info("Found [%d] more apps, scaling down to [%d]", delta, running_count)
                self.scale_app(app, running_count)
                # Allow for some time before the next poll.
                time.sleep(1)
                continue
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(sleep_before_next_try)
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale, timeout=300):
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.scale_app(app, scale)
                return
            except Exception:
                l.info("mcli: scale_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli scale_app timed out, possible zookeeper/marathon/mesos malfunction")

    def ping(self):
        return self.mcli.ping()

    def kill_task(self, app_id, task_id):
        return self.mcli.kill_task(app_id, task_id)
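A hedged usage sketch for the wrapper above: the Marathon address, host address, app ID, and app definition are placeholders, and `mesos` is whatever Mesos helper object the caller normally passes (None here only for illustration).

from marathon.models import MarathonApp

mif = MarathonIF('http://127.0.0.1:8080', my_addr='10.0.0.5', mesos=None)  # assumed addresses

# Idempotent start: remove any leftover copy, then create and wait for two running tasks.
mif.delete_app_ifexisting('demo/sleeper')
mif.wait_app_removal('demo/sleeper')
app_def = MarathonApp(cmd='sleep 3600', mem=64, cpus=0.5, instances=2)
mif.create_app('demo/sleeper', app_def)
mif.wait_app_ready('demo/sleeper', running_count=2)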
from time import time, sleep

from distributed import Client
from daskathon import MarathonCluster
from marathon import MarathonClient

# Clean up any leftover apps on the local Marathon before the tests run.
cg = MarathonClient('http://localhost:8080')
for app in cg.list_apps():
    cg.delete_app(app.id, force=True)


def test_multiple_workers():
    with MarathonCluster(nworkers=2,
                         marathon='http://localhost:8080',
                         scheduler_port=9001,
                         diagnostics_port=9101) as mc:
        while len(mc.scheduler.workers) < 2:
            sleep(0.1)
        with Client(mc.scheduler_address) as c:
            x = c.submit(lambda x: x + 1, 1)
            assert x.result() == 2


def test_manual_scaling():
    with MarathonCluster(marathon='http://localhost:8080',
                         scheduler_port=9002,
                         diagnostics_port=9102) as mc:
        assert not mc.scheduler.ncores
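The module-level loop above wipes every app on the cluster at import time. As a hedged alternative, the same cleanup could be expressed as a test fixture so it runs before each test; pytest is an assumption here, since the original file only shows bare test functions.

# Hypothetical sketch: the cleanup refactored into an autouse pytest fixture.
import pytest
from marathon import MarathonClient


@pytest.fixture(autouse=True)
def clean_marathon():
    client = MarathonClient('http://localhost:8080')
    for app in client.list_apps():
        client.delete_app(app.id, force=True)
    yield  # each test starts against an empty cluster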