def rolling_replace_app(service_name, app1_id, app2_id, app2_config, labels):
    """Replace a running Marathon app with a new one, one task at a time.

    Launches app2 with 0 instances, then for every task of app1: scale app2
    up by one, wait until the new task has started, and kill the matching
    old task.  Finally deletes app1 entirely.

    Relies on module-level globals: marathon_host, marathon_port, launcher,
    and the sibling helper num_started_tasks().  Python 2 syntax (print
    statements).
    """
    print ' replacing '+app1_id+' with '+app2_id
    marathon_client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    app1 = marathon_client.get_app(app1_id)
    # Snapshot the old task list up-front; we iterate it while killing tasks.
    old_tasks = app1.tasks
    # launcher.launch(group2.service.name, group2.encode_marathon_id, group2.config, instances = 0)
    # Create the replacement app with zero instances; it is scaled up below.
    launcher.launch_app(service_name, app2_id, app2_config, labels, instances = 0 )
    new_app = marathon_client.get_app(app2_id)
    for old_task in old_tasks:
        #
        # replace each old task with a new task of the new app
        #
        num_started = num_started_tasks(app2_id)
        new_instances = num_started+1 # add 1 instance of new task
        launcher.update_app(app2_id, app2_config, new_instances)
        # Busy-wait (1s poll) until Marathon reports the extra task started.
        while num_started < new_instances:
            time.sleep(1)
            print 'waiting for app to start '+str(num_started)
            num_started = num_started_tasks(app2_id)
        #
        # take down old task
        #
        # scale=True shrinks app1's instance count along with the kill.
        marathon_client.kill_task(app1_id, old_task.id, scale=True)
    # All old tasks replaced; remove the old app definition.
    marathon_client.delete_app(app1_id)
def launch_elsa(marathon, stats_file, scale_window): logging.info('Start monitoring the inbound traffic on topics using %s' % (stats_file)) # make sure the stats file is properly initialized: if not os.path.exists(stats_file): f = open(stats_file, 'w') f.write('0') f.close() # launch the Elsa app via Marathon c = MarathonClient(marathon) c.create_app( 'elsa', MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh', mem=200, cpus=1, user='******')) # c.list_apps() print( 'ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...' ) time.sleep(5) # allow time to deploy before autoscaling sets in # kick off traffic monitoring and trigger autoscaling: previous_topic_traffic = 0 try: while True: with open(stats_file, 'r') as elsa_file: topic_traffic = int(elsa_file.read()) topic_traffic_diff = topic_traffic - previous_topic_traffic print('Difference in traffic in the past %d seconds: %d' % (scale_window, topic_traffic_diff)) previous_topic_traffic = topic_traffic current_instance_num = c.get_app('elsa').instances if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD: # we see a surge of traffic above threshold ... instance_multiplier = int( topic_traffic_diff / SCALE_FACTOR) # ... increase number of instances c.scale_app('elsa', current_instance_num * instance_multiplier) print('Increasing number of instances to %d' % (current_instance_num * instance_multiplier)) elif topic_traffic_diff < 0: # negative, back off exponentially target_instance_num = int(current_instance_num / 2) if target_instance_num > 1: c.scale_app('elsa', target_instance_num) print('Decreasing number of instances to %d' % (target_instance_num)) else: c.scale_app('elsa', 1) print('Resetting number of instances to 1') time.sleep(scale_window) except KeyboardInterrupt: print( 'ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!' ) c.delete_app('elsa', force=True)
def update_app(app_id, config, instances = 1):
    """Update a deployed Marathon app's container image and instance count.

    Builds a docker image reference from config['image'] and pushes the
    updated container spec back to Marathon.  Cassandra apps additionally
    get their fixed port mappings passed through as docker options.
    """
    client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    current_app = client.get_app(app_id)
    image_string = 'docker:///' + config['image']

    # Cassandra needs explicit host-port publishing; everything else runs
    # with no extra docker options.  TODO this is terrible dawg
    decoded = namespacer.decode_marathon_id(app_id)
    is_cassandra = str(decoded['service']) == "cassandra"
    options = ["-p", "7000:7000", "-p", "9042:9042", "-p", "9160:9160",
               "-p", "22000:22", "-p", "5000:5000"] if is_cassandra else []

    # ports = []
    # constraints = [["hostname", "UNIQUE"]]
    client.update_app(
        app_id,
        current_app,
        instances = instances,
        container = { "image" : image_string, "options" : options },
    )
class MarathonDeployer(object):
    """Deploys every task of a task chain as its own Marathon app."""

    def __init__(self, marathon_url):
        self.url = marathon_url
        self.client = MarathonClient(self.url)

    def deploy(self, task_chain, environment_name):
        """Create one Marathon app per task in *task_chain*.

        Any pre-existing app with the same (lower-cased) task id is deleted
        first; lookup/delete failures are ignored so a missing app does not
        abort the rollout.
        """
        deployed_chain = DeployedTaskChain(task_chain, environment_name)
        for task in deployed_chain.list_all_tasks():
            task_id = task['id']
            safe_name = task_id.lower()
            # safe_name = task['name'].replace('.', '').lower()
            # Best-effort teardown of a previous deployment of this task.
            try:
                existing = self.client.get_app(safe_name)
            except Exception:
                existing = None
            if existing is not None:
                try:
                    self.client.delete_app(safe_name)
                    time.sleep(2)
                except Exception:
                    pass
            invoke_cmd = '/var/riversnake/invoke.py {0} {1} {2}'.format(
                task_chain.flow_name, environment_name, task_id)
            self.client.create_app(
                safe_name, MarathonApp(cmd=invoke_cmd, mem=16, cpus=1))
def send_to_marathon(request):
    """Django view: proxy a POSTed action to the Marathon cluster.

    Supported actions: stop, start, destroy (permission-checked), restart,
    scale, update, stop-deployment.  Returns a small JSON status string.

    Fixes over the original:
    - *result* and *action* are initialized up-front; the original raised
      NameError at the return for non-POST requests (and referenced an
      unbound *action* in the except handler if the failure happened early).
    - PermissionDenied is re-raised instead of being swallowed by the broad
      except, so Django can produce its 403 response as the `raise` intended.
    """
    action = None
    result = '{"status":"error", "msg": "no action performed"}'
    try:
        if request.method == 'POST':
            action = request.POST.get('action', None)
            id = request.POST.get('id', None)
            mc = MarathonClient('http://{}:{}'.format(settings.MARATHON['host'], settings.MARATHON['port']))
            if action == 'stop':
                mc.scale_app(id, 0, force=True)
            elif action == 'start':
                mc.scale_app(id, 1)
            elif action == 'destroy':
                if request.user.has_perm("auth.can_init_app"):
                    mc.delete_app(id)
                else:
                    raise PermissionDenied
            elif action == 'restart':
                mc.restart_app(id)
            elif action == 'scale':
                mc.scale_app(id, int(request.POST.get('number_instance')))
            elif action == 'update':
                app = mc.get_app(id)
                app.cpus = float(request.POST.get('cpus'))
                app.mem = float(request.POST.get('mem'))
                app.container.docker.image = request.POST.get('version')
                mc.update_app(id, app)
            elif action == "stop-deployment":
                mc.delete_deployment(id)
            result = '{"status":"success", "msg": "%(action)s success"}'%{"action":action}
    except PermissionDenied:
        # Let Django's 403 handling see this, as the `raise` above intends.
        raise
    except Exception as e:
        # html.escape prevents the error text from breaking the JSON/markup.
        result = '{"status":"error", "msg": "%(action)s fail: %(error)s" }'%{"action":action, "error": html.escape(str(e))}
    return HttpResponse(result)
def update(service, instances = 1):
    """Update a named service's Marathon app from the global config dict.

    Looks up the docker image and Marathon endpoint in the module-level
    *data* mapping and pushes the new container spec.  Python 2 syntax
    (print statements).
    """
    #
    # set up marathon client and launch container
    #
    print 'updating ' + service
    image_string = 'docker:///' + data['services'][service]['image']
    print image_string
    marathon_client = MarathonClient('http://' + str(data['marathon']['host']) + ':' + str(data['marathon']['port']))
    app = marathon_client.get_app(service)
    #
    # set up options for cassandra
    #
    # Cassandra is special-cased to publish its fixed set of host ports.
    options = []
    if service == "cassandra":
        options = ["-p", "7000:7000", "-p", "9042:9042", "-p", "9160:9160", "-p", "22000:22", "-p", "5000:5000"]
    # ports = []
    # constraints = [["hostname", "UNIQUE"]]
    marathon_client.update_app(
        service, app, instances = instances,
        container = { "image" : image_string, "options" : options }
    )
def sync_marathon_app(): """Identify the hosts and ports of executing tasks Optional environment variables: MARATHON_ROOT_URL: protocol, address or ip and port to Marathon MARATHON_APP: app name within Marathon used to group all tasks (server instances) MARATHON_APP_PORT: internal port of service (internal to docker container: default of 8080) :return: """ # Identify the hosts and ports of executing tasks try: c = None if len(DCOS_OAUTH_TOKEN): c = MarathonClient(MARATHON_ROOT_URLS, auth_token=DCOS_OAUTH_TOKEN) else: c = MarathonClient(MARATHON_ROOT_URLS) app = c.get_app(MARATHON_APP) port_index = find_port_index_by_container_port(app, MARATHON_APP_PORT) if port_index is None: raise Exception('Unable to correlate container to host port.') instances = [] for task in app.tasks: logging.info('Queuing configuration refresh of %s at %s:%s' % (task.id, task.host, task.ports[port_index])) instances.append('%s:%s' % (task.host, task.ports[port_index])) reload_config(instances) except MarathonError, ex: print 'Error making Marathon API call: %s' % ex.message
def _update_application(client: MarathonClient, app: MarathonApp, definition_path: str, do_backup: bool = False) -> Union[str, bool]:
    """Push an updated app definition to Marathon, optionally backing it up.

    :param client: connected Marathon client
    :param app: the new application definition to deploy
    :param definition_path: path the definition was loaded from (logging only)
    :param do_backup: when True, snapshot the currently-deployed app as JSON
        under ./backups before updating
    :return: the backup file path ('' when do_backup is False) on success,
        or False when the deployment did not complete (the app is then
        restarted to recover).

    Fix over the original: wait_for_deployment() was called twice on the
    same deployment (once in the failure check and again inside the return
    expression), blocking twice; the result is now captured once.
    """
    if do_backup:
        if not os.path.isdir('./backups'):
            os.mkdir('./backups/')
            print('Created backups directory')
        backup = client.get_app(app.id).to_json()
        backup_path = './backups/{}_{}.json'.format(
            mangling.appid_to_filename(app.id), time.strftime("%Y-%m-%d_%H:%M:%S"))
        with open(backup_path, 'w') as backup_file:
            backup_file.write(backup)
        print('\nBacked app into: {}'.format(backup_path))
    else:
        backup_path = ''
    print('Updating app: {} (from: {})'.format(app.id, definition_path))
    deployment = client.update_app(app.id, app, force=True)  # TODO: Handle failure
    # Return the deployed backup file to build rollback order, if necessary
    # or False if a user-initiated rollback completed successfully
    succeeded = wait_for_deployment(client, deployment)
    if not succeeded:
        client.restart_app(app.id)
        return False
    return backup_path
def in_place_restart(client: MarathonClient, appid: str):
    """Bounce an app by scaling it to zero and back to its prior count."""
    original_count = client.get_app(appid).instances
    wait_for_deployment(client, client.scale_app(appid, 0))
    print('Scaled {} down to 0'.format(appid))
    wait_for_deployment(client, client.scale_app(appid, original_count))
    print('{} back at {} again'.format(appid, original_count))
def update_app_tag(client: MarathonClient, appid: str, new_tag: str):
    """Re-deploy an app with its Docker image re-tagged to *new_tag*."""
    app = client.get_app(appid)
    # Split registry/name/tag, swap only the tag, and reassemble the image ref.
    registry, image = mangling.split_image_name(app.container.docker.image)
    image, _ = mangling.split_image_tag(image)
    app.container.docker.image = mangling.rebuild_image_name(registry, image, new_tag)
    wait_for_deployment(client, client.update_app(appid, app, force=True))
def num_started_tasks(app_id):
    """Return how many tasks of *app_id* Marathon reports as started."""
    client = MarathonClient('http://' + str(marathon_host) + ':' + str(marathon_port))
    # A task counts as started once Marathon has stamped started_at on it.
    return sum(1 for task in client.get_app(app_id).tasks if task.started_at)
def get_hosts_dict(self):
    """Group every task of every app by the host it runs on.

    :return: dict mapping hostname -> list of MarathonTask objects.
    """
    by_host = {}
    for app in MarathonClient.list_apps(self):
        for task in MarathonClient.get_app(self, app.id).tasks:
            by_host.setdefault(task.host, []).append(task)
    return by_host
def get_hosts_dict(self):
    """Build a hostname -> [tasks] mapping across all Marathon apps."""
    mapping = {}
    for application in MarathonClient.list_apps(self):
        for task in MarathonClient.get_app(self, application.id).tasks:
            bucket = mapping.get(task.host)
            if bucket is None:
                bucket = mapping[task.host] = []
            bucket.append(task)
    return mapping
class ServiceRun():
    """Bootstraps a GlusterFS node running as a Marathon task.

    Reads its Marathon/Mesos identity from environment variables
    (MARATHON_URL, GLUSTER_APP_ID, MESOS_TASK_ID) and offers helpers to
    probe/wait for the Gluster peer cluster.  Python 2 syntax
    (dict.itervalues, `except Exception,e`).
    """

    def __init__(self, gluster_directory, list_volumes, transport, stripe = None, replica = None, quota = None):
        """Validate settings and connect to Marathon.

        :param gluster_directory: host directory used to store gluster volumes
        :param list_volumes: volumes to manage
        :param transport: gluster transport (e.g. tcp)
        :param stripe: optional stripe count
        :param replica: optional replica count
        :param quota: optional volume quota
        :raises Exception: when gluster_directory or transport is missing
        """
        print("init")
        # NOTE(review): typos in the message ("te", "folume") are preserved —
        # they are runtime strings.
        if gluster_directory is None or gluster_directory == "":
            raise Exception("You must set te directory to store gluster folume")
        if transport is None or transport == "":
            raise Exception("You must set the transport")
        self.__gluster_directory = gluster_directory
        self.__list_volumes = list_volumes
        self.__transport = transport
        self.__stripe = stripe
        self.__replica = replica
        self.__quota = quota
        self.__is_on_cluster = False
        # Identity of this task's own Marathon app, taken from the environment.
        self.__marathon_client = MarathonClient(os.environ['MARATHON_URL'])
        self.__gluster_app_id = os.environ['GLUSTER_APP_ID']
        self.__marathon_app = self.__marathon_client.get_app(self.__gluster_app_id)
        self.__task_id = os.environ['MESOS_TASK_ID']

    def __is_already_on_glusterfs(self):
        # True when the local gluster daemon already knows at least one peer.
        gluster = Gluster()
        peer_manager = gluster.get_peer_manager()
        peer_status = peer_manager.status()
        if peer_status["peers"] == 0:
            return False
        else:
            return True

    def __is_cluster_already_exist(self, list_nodes):
        # True when any node in list_nodes reports existing gluster peers.
        for node in list_nodes.itervalues():
            gluster = Gluster(node['ip'])
            peer_status = gluster.get_peer_manager().status()
            if peer_status["peers"] > 0:
                return True
        return False

    def __wait_all_glusterfs_start(self, list_nodes):
        # Poll once per second until every node answers a peer-status query;
        # any connection failure restarts the wait.
        loop = True
        while loop:
            time.sleep(1)
            try:
                for node in list_nodes.itervalues():
                    gluster = Gluster(node['ip'])
                    peer_status = gluster.get_peer_manager().status()
                loop = False
            except Exception,e:
                loop = True
def get_random_marathon_task(app_id):
    """Connect to Marathon and return a random task for a given application ID.

    :param app_id: The Marathon application ID.
    :return: tuple of the instance IP or hostname and listening port.
    """
    client = MarathonClient("http://{}".format(options.marathon_host))
    tasks = client.get_app(app_id).tasks
    chosen = tasks[randrange(0, len(tasks))]
    return chosen.host, chosen.ports[0]
class TestCreateApp(unittest.TestCase):
    """ Test the creation of a Marathon app against a live endpoint. Configure MARATHON_SERVER in tests.config. """

    def setUp(self):
        # Generate a random server configuration and create the app, then
        # poll until Marathon reports at least one healthy task.
        self._app = get_app()  # Generate a random server configuration.
        self.client = MarathonClient(MARATHON_SERVER)
        self.client.create_app(app_id=self._app.id, app=self._app)
        time.sleep(2)  # Wait two seconds for the POST to be processed by Marathon.
        self.app = self.client.get_app(self._app.id)
        while not self.app.tasks_healthy:  # Wait until the app becomes healthy.
            self.app = self.client.get_app(self._app.id)
            time.sleep(1)

    def test_create(self):
        # The round-tripped app and its nested objects must come back as the
        # proper model types, not raw dicts.
        self.assertIsInstance(self.app, MarathonApp)
        self.assertIsInstance(self.app.upgrade_strategy, MarathonUpgradeStrategy)
        self.assertIsInstance(self.app.tasks.pop(), MarathonTask)
        self.assertIsInstance(self.app.health_checks.pop(), MarathonHealthCheck)

    def tearDown(self):
        # force=True removes the app even mid-deployment.
        self.client.delete_app(self.app.id, force=True)
def get_random_marathon_task(app_id):
    """Connect to Marathon and return a random task for a given application ID.

    :param app_id: The Marathon application ID.
    :return: tuple of the instance IP or hostname and listening port.
    """
    marathon = MarathonClient('http://{}'.format(options.marathon_host))
    app = marathon.get_app(app_id)
    index = randrange(0, len(app.tasks))
    task = app.tasks[index]
    return task.host, task.ports[0]
def launch_elsa(marathon, stats_file, scale_window): logging.info('Start monitoring the inbound traffic on topics using %s' %(stats_file)) # make sure the stats file is properly initialized: if not os.path.exists(stats_file): f = open(stats_file, 'w') f.write('0') f.close() # launch the Elsa app via Marathon c = MarathonClient(marathon) c.create_app('elsa', MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh', mem=200, cpus=1, user='******')) # c.list_apps() print('ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...') time.sleep(5) # allow time to deploy before autoscaling sets in # kick off traffic monitoring and trigger autoscaling: previous_topic_traffic = 0 try: while True: with open(stats_file, 'r') as elsa_file: topic_traffic = int(elsa_file.read()) topic_traffic_diff = topic_traffic - previous_topic_traffic print('Difference in traffic in the past %d seconds: %d' %(scale_window, topic_traffic_diff)) previous_topic_traffic = topic_traffic current_instance_num = c.get_app('elsa').instances if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD: # we see a surge of traffic above threshold ... instance_multiplier = int(topic_traffic_diff / SCALE_FACTOR) # ... increase number of instances c.scale_app('elsa', current_instance_num * instance_multiplier) print('Increasing number of instances to %d' %(current_instance_num * instance_multiplier)) elif topic_traffic_diff < 0: # negative, back off exponentially target_instance_num = int(current_instance_num/2) if target_instance_num > 1: c.scale_app('elsa', target_instance_num) print('Decreasing number of instances to %d' %(target_instance_num)) else: c.scale_app('elsa', 1) print('Resetting number of instances to 1') time.sleep(scale_window) except KeyboardInterrupt: print('ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!') c.delete_app('elsa', force=True)
def sync_marathon_app(): """Identify the hosts and ports of executing tasks Optional environment variables: MARATHON_ROOT_URL: protocol, address or ip and port to Marathon MARATHON_APP: app name within Marathon used to group all tasks (server instances) MARATHON_APP_PORT: internal port of service (internal to docker container: default of 8080) :return: """ # Identify the hosts and ports of executing tasks try: c = MarathonClient(MARATHON_ROOT_URL) app = c.get_app(MARATHON_APP) container_port = MARATHON_APP_PORT port_index = None if app and app.container and app.container.docker and app.container.docker.port_mappings: for i in range(len(app.container.docker.port_mappings)): if container_port == app.container.docker.port_mappings[i].container_port: # Set port index to use for identifying the exposed port # that maps to internal container port port_index = i break if port_index is None: raise Exception('Unable to correlate container to host port.') instances = [] for task in app.tasks: logging.info('Queuing configuration refresh of %s at %s:%s' % (task.id, task.host, task.ports[port_index])) instances.append('%s:%s' % (task.host, task.ports[port_index])) reload_config(instances) except MarathonError, ex: print 'Error making Marathon API call: %s' % ex.message
def send_to_marathon(request):
    """Django view: dispatch a POSTed Marathon action and return JSON status.

    Supported actions: stop, start, destroy (permission-checked), restart,
    scale, update, stop-deployment.

    Fixes over the original:
    - *result* and *action* are initialized before the try block; the
      original raised NameError at the return when the request was not a
      POST (and the except handler could reference an unbound *action*).
    - PermissionDenied propagates instead of being flattened into the JSON
      error string by the broad except, preserving the intended 403.
    """
    action = None
    result = '{"status":"error", "msg": "no action performed"}'
    try:
        if request.method == 'POST':
            action = request.POST.get('action', None)
            id = request.POST.get('id', None)
            mc = MarathonClient('http://{}:{}'.format(
                settings.MARATHON['host'], settings.MARATHON['port']))
            if action == 'stop':
                mc.scale_app(id, 0, force=True)
            elif action == 'start':
                mc.scale_app(id, 1)
            elif action == 'destroy':
                if request.user.has_perm("auth.can_init_app"):
                    mc.delete_app(id)
                else:
                    raise PermissionDenied
            elif action == 'restart':
                mc.restart_app(id)
            elif action == 'scale':
                mc.scale_app(id, int(request.POST.get('number_instance')))
            elif action == 'update':
                app = mc.get_app(id)
                app.cpus = float(request.POST.get('cpus'))
                app.mem = float(request.POST.get('mem'))
                app.container.docker.image = request.POST.get('version')
                mc.update_app(id, app)
            elif action == "stop-deployment":
                mc.delete_deployment(id)
            result = '{"status":"success", "msg": "%(action)s success"}' % {
                "action": action
            }
    except PermissionDenied:
        # Re-raise so Django renders its 403 page as intended above.
        raise
    except Exception as e:
        result = '{"status":"error", "msg": "%(action)s fail: %(error)s" }' % {
            "action": action,
            "error": html.escape(str(e))
        }
    return HttpResponse(result)
class MarathonSpawner(Spawner): app_image = Unicode("jupyterhub/singleuser", config=True) app_prefix = Unicode( "jupyter", help=dedent( """ Prefix for app names. The full app name for a particular user will be <prefix>/<username>. """ ) ).tag(config=True) marathon_host = Unicode( u'', help="Hostname of Marathon server").tag(config=True) marathon_constraints = List( [], help='Constraints to be passed through to Marathon').tag(config=True) ports = List( [8888], help='Ports to expose externally' ).tag(config=True) volumes = List( [], help=dedent( """ A list in Marathon REST API format for mounting volumes into the docker container. [ { "containerPath": "/foo", "hostPath": "/bar", "mode": "RW" } ] Note that using the template variable {username} in containerPath, hostPath or the name variable in case it's an external drive it will be replaced with the current user's name. """ ) ).tag(config=True) network_mode = Unicode( 'BRIDGE', help="Enum of BRIDGE or HOST" ).tag(config=True) hub_ip_connect = Unicode( "", help="Public IP address of the hub" ).tag(config=True) hub_port_connect = Integer( -1, help="Public PORT of the hub" ).tag(config=True) format_volume_name = Any( help="""Any callable that accepts a string template and a Spawner instance as parameters in that order and returns a string. 
""" ).tag(config=True) @default('format_volume_name') def _get_default_format_volume_name(self): return default_format_volume_name _executor = None @property def executor(self): cls = self.__class__ if cls._executor is None: cls._executor = ThreadPoolExecutor(1) return cls._executor def __init__(self, *args, **kwargs): super(MarathonSpawner, self).__init__(*args, **kwargs) self.marathon = MarathonClient(self.marathon_host) @property def container_name(self): return '/%s/%s' % (self.app_prefix, self.user.name) def get_state(self): state = super(MarathonSpawner, self).get_state() state['container_name'] = self.container_name return state def load_state(self, state): if 'container_name' in state: pass def get_health_checks(self): health_checks = [] health_checks.append(MarathonHealthCheck( protocol='TCP', port_index=0, grace_period_seconds=300, interval_seconds=60, timeout_seconds=20, max_consecutive_failures=0 )) return health_checks def get_volumes(self): volumes = [] for v in self.volumes: mv = MarathonContainerVolume.from_json(v) mv.container_path = self.format_volume_name(mv.container_path, self) mv.host_path = self.format_volume_name(mv.host_path, self) if mv.external and 'name' in mv.external: mv.external['name'] = self.format_volume_name(mv.external['name'], self) volumes.append(mv) return volumes def get_port_mappings(self): port_mappings = [] for p in self.ports: port_mappings.append( MarathonContainerPortMapping( container_port=p, host_port=0, protocol='tcp' ) ) return port_mappings def get_constraints(self): constraints = [] for c in self.marathon_constraints: constraints.append(MarathonConstraint.from_json(c)) @run_on_executor def get_deployment(self, deployment_id): deployments = self.marathon.list_deployments() for d in deployments: if d.id == deployment_id: return d return None @run_on_executor def get_deployment_for_app(self, app_name): deployments = self.marathon.list_deployments() for d in deployments: if app_name in d.affected_apps: return d return 
None def get_ip_and_port(self, app_info): assert len(app_info.tasks) == 1 ip = socket.gethostbyname(app_info.tasks[0].host) return (ip, app_info.tasks[0].ports[0]) @run_on_executor def get_app_info(self, app_name): try: app = self.marathon.get_app(app_name, embed_tasks=True) except NotFoundError: self.log.info("The %s application has not been started yet", app_name) return None else: return app def _public_hub_api_url(self): uri = urlparse(self.hub.api_url) port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname return urlunparse(( uri.scheme, '%s:%s' % (ip, port), uri.path, uri.params, uri.query, uri.fragment )) def get_env(self): env = super(MarathonSpawner, self).get_env() env.update(dict( # Jupyter Hub config JPY_USER=self.user.name, JPY_COOKIE_NAME=self.user.server.cookie_name, JPY_BASE_URL=self.user.server.base_url, JPY_HUB_PREFIX=self.hub.server.base_url, )) if self.notebook_dir: env['NOTEBOOK_DIR'] = self.notebook_dir if self.hub_ip_connect or self.hub_port_connect > 0: hub_api_url = self._public_hub_api_url() else: hub_api_url = self.hub.api_url env['JPY_HUB_API_URL'] = hub_api_url return env @gen.coroutine def start(self): docker_container = MarathonDockerContainer( image=self.app_image, network=self.network_mode, port_mappings=self.get_port_mappings()) app_container = MarathonContainer( docker=docker_container, type='DOCKER', volumes=self.get_volumes()) # the memory request in marathon is in MiB if hasattr(self, 'mem_limit') and self.mem_limit is not None: mem_request = self.mem_limit / 1024.0 / 1024.0 else: mem_request = 1024.0 app_request = MarathonApp( id=self.container_name, env=self.get_env(), cpus=self.cpu_limit, mem=mem_request, container=app_container, constraints=self.get_constraints(), health_checks=self.get_health_checks(), instances=1 ) app = self.marathon.create_app(self.container_name, app_request) if app is False or app.deployments is None: 
self.log.error("Failed to create application for %s", self.container_name) return None while True: app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: ip, port = self.get_ip_and_port(app_info) break yield gen.sleep(1) return (ip, port) @gen.coroutine def stop(self, now=False): try: status = self.marathon.delete_app(self.container_name) except: self.log.error("Could not delete application %s", self.container_name) raise else: if not now: while True: deployment = yield self.get_deployment(status['deploymentId']) if deployment is None: break yield gen.sleep(1) @gen.coroutine def poll(self): deployment = yield self.get_deployment_for_app(self.container_name) if deployment: for current_action in deployment.current_actions: if current_action.action == 'StopApplication': self.log.error("Application %s is shutting down", self.container_name) return 1 return None app_info = yield self.get_app_info(self.container_name) if app_info and app_info.tasks_healthy == 1: return None return 0
#参考链接:https://github.com/thefactory/marathon-python server = "http://10.30.0.6:8080" maraclient = MarathonClient(servers=server) # 显示所有app信息 applist = maraclient.list_apps() #print(applist) appId = "/java/activity-view" # 获取app信息 app_info = maraclient.get_app(app_id=appId) #print(app_info) print(app_info.instances) # 重启APP #maraclient.restart_app(app_id=appId) from marathon.models import MarathonApp #创建APP #maraclient.create_app("/web/nginx",MarathonApp(mem=256, cpus=0.1) #扩缩容APP #maraclient.scale_app("nginx",instances=1) #maraclient.scale_app("nginx",delta=-1)
class Scaler:
    """Autoscaler for a single Marathon app.

    Pulls the app record and scaling policies from a MariaDB REST service,
    reads per-container cpu/memory stats from InfluxDB, scales the app via
    Marathon, and regenerates the haproxy config through servicerouter.py
    after each change.
    """

    def __init__(self, app_name, config):
        """Load app + policies, connect to InfluxDB/Marathon, refresh haproxy.

        :param app_name: Marathon app name to manage
        :param config: dict with MARIA_RESTFUL, INFLUXDB, MARATHON and TIME sections
        """
        self.logger = logging.getLogger("autoscaling")
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug("Init object scaler...")
        self.config = config
        self.logger.debug("Connect RESTful mariadb and get policies...")
        # App metadata and its scaling policies come from the REST service.
        conn = http.client.HTTPConnection(config["MARIA_RESTFUL"]['host'], config["MARIA_RESTFUL"]['port'])
        conn.request("GET", "/app/name/" + app_name)
        json_app = conn.getresponse().read().decode("utf-8")
        self.app = json.loads(json_app)
        conn.request("GET", "/app/name/" + app_name + "/policies")
        json_policies = conn.getresponse().read().decode("utf-8")
        self.app["policies"] = json.loads(json_policies)
        self.logger.debug("Connect influxdb and marathon...")
        self.influx_client = InfluxDBClient(config["INFLUXDB"]["host"], config["INFLUXDB"]["port"],
                                            config["INFLUXDB"]["username"], config["INFLUXDB"]["password"],
                                            config["INFLUXDB"]["db_name"])
        self.marathon_client = MarathonClient('http://' + config["MARATHON"]['host'] + ':' + config["MARATHON"]['port'])
        # Cache live instance count and resource limits from Marathon.
        self.app["instance"] = self.marathon_client.get_app(app_name).instances
        self.app["mem"] = self.marathon_client.get_app(app_name).mem
        self.app["cpus"] = self.marathon_client.get_app(app_name).cpus
        self.logger.debug("Reconfig haproxy.cfg...")
        os.system("sudo ./servicerouter.py --marathon http://" + config["MARATHON"]["host"] + ":" +
                  config["MARATHON"]["port"] + " --haproxy-config /etc/haproxy/haproxy.cfg")

    def setup_logging(self, log_file="autoscaling.log", level=logging.INFO, formatter=None):
        """Attach a file handler to the autoscaling logger."""
        if (formatter == None):
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh = logging.FileHandler(log_file)
        fh.setLevel(level)
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

    def get_cpu_usage(self, container_name):
        """Return cpu usage (percent of the app's cpu allocation) of container_name.

        @param string container_name container name
        """
        query = "select DERIVATIVE(cpu_cumulative_usage) as cpu_usage from stats where container_name = '" + container_name + "' and time > now()-5m group by time(2s) "
        result = self.influx_client.query(query)
        points = result[0]["points"]
        # cumulative usage is in nanoseconds; normalize by allocated cpus.
        return (points[0][1] / 1000000000 / self.app["cpus"]) * 100

    def get_container_name(self, mesos_task_id):
        """Return container name mapping with mesos_task_id in messos.

        @param string mesos_task_id
        """
        query = "select container_name from " + self.config["INFLUXDB"]["ts_mapping"] + " where time>now() - 5m and mesos_task_id = '" + mesos_task_id + "' limit 1"
        result = self.influx_client.query(query)
        points = result[0]["points"]
        return points[0][2]

    def get_containers_name(self):
        """Return list all containers name of application have name app_name.

        @return list all containers name of app_name
        """
        tasks = self.marathon_client.list_tasks(self.app["name"])
        containers_name = []
        for task in tasks:
            containers_name.append(self.get_container_name(task.id))
        return containers_name

    def avg_mem_usage(self, containers_name):
        """Return avg memmory usage of all containers in list containers_name.

        @param list containers_name list containers name
        @return float avg mem usage (percent of the app's memory allocation)
        """
        number_container = len(containers_name)
        containers_name = ["'" + x + "'" for x in containers_name]
        containers_name = ",".join(containers_name)
        query = "select memory_usage,container_name from stats where time > now()-5m and container_name in (" + containers_name + ") limit " + str(
            number_container * 2)
        result = self.influx_client.query(query)
        points = result[0]["points"]
        sum_memory_usage = 0
        for point in points:
            if (point[3] != None):
                # 1048576 = bytes per MiB; app["mem"] is in MiB.
                sum_memory_usage += point[3] / (self.app["mem"] * 1048576) * 100
        return sum_memory_usage / number_container

    def avg_cpu_usage(self, containers_name):
        """Return avg cpu usage of all containers in list containers_name.

        @param list containers_name list containers name
        @return float avg cpu usage
        """
        number_container = len(containers_name)
        containers_name = ["'" + x + "'" for x in containers_name]
        containers_name = ",".join(containers_name)
        query = "select DERIVATIVE(cpu_cumulative_usage) as cpu_usage,container_name from stats where time > now()-5m and container_name in (" + containers_name + ") group by time(10s),container_name limit " + str(
            number_container)
        result = self.influx_client.query(query)
        points = result[0]["points"]
        sum_cpu_usage = 0
        for point in points:
            sum_cpu_usage += point[1] / 1000000000 / self.app["cpus"] * 100
        return sum_cpu_usage / number_container

    def scale(self, delta):
        """sacle app_name (add or remove) delta intances.

        Clamps to [min_instances, max_instances], scales via Marathon, then
        rewrites the haproxy config and refreshes the cached instance count.

        @param int delta number intances add or remove (negative removes)
        """
        new_instance = self.app["instance"] + delta
        if (new_instance > self.app['max_instances']):
            new_instance = self.app['max_instances']
        if (new_instance < self.app['min_instances']):
            new_instance = self.app['min_instances']
        if (new_instance != self.app["instance"]):
            self.marathon_client.scale_app(self.app["name"], new_instance)
            self.logger.debug("Scaling " + self.app["name"] + " to: " + str(new_instance))
            self.logger.debug("Waiting for config file haproxy.cfg...")
            time.sleep(self.config["TIME"]['w_config_ha'])
            self.logger.debug("Config file haproxy.cfg...")
            os.system("sudo ./servicerouter.py --marathon http://" + self.config["MARATHON"]["host"] + ":" +
                      self.config["MARATHON"]["port"] + " --haproxy-config /etc/haproxy/haproxy.cfg")
            self.app["instance"] = self.marathon_client.get_app(self.app["name"]).instances
            self.logger.debug("Sleep " + str(self.config["TIME"]['after_scale']) + "s...")
            time.sleep(self.config["TIME"]['after_scale'])

    def check_rule(self, policie, value):
        """Check rule and return number intances need scale.

        @param models.Policie policies
        @param tuple value values of metric (indexed by policie["metric_type"])
        @return dict with 'up'/'down' instance deltas (0 when not triggered)
        """
        delta = {}
        delta["up"] = 0
        delta["down"] = 0
        # Check upper_threshold
        if (value[policie["metric_type"]] > policie["upper_threshold"]):
            delta['up'] = policie["instances_in"]
        # Check lower_threshold
        if (value[policie["metric_type"]] < policie["lower_threshold"]):
            delta['down'] = policie["instances_out"]
        return delta

    def autoscaling(self):
        """Main loop: evaluate all policies every TIME.monitor seconds and
        scale up by the largest 'up' delta, else down by the smallest 'down'.

        NOTE(review): rs_detal['down'] starts at 10 as a sentinel for the
        min() accumulation; with an empty policy list this requests a
        10-instance scale-down (clamped by min_instances in scale()) —
        verify that is intended.
        """
        while True:
            try:
                containers_name = self.get_containers_name()
                avg_cpu = self.avg_cpu_usage(containers_name)
                avg_mem = self.avg_mem_usage(containers_name)
                self.logger.info(
                    "Avg cpu usage, avg memmory usage, current instance: %f %f %d",
                    avg_cpu, avg_mem, self.app["instance"])
                rs_detal = {}
                rs_detal['up'] = 0
                rs_detal['down'] = 10
                for policie in self.app["policies"]:
                    delta = self.check_rule(policie, (avg_cpu, avg_mem))
                    if (rs_detal['up'] < delta['up']):
                        rs_detal['up'] = delta['up']
                    if (rs_detal['down'] > delta['down']):
                        rs_detal['down'] = delta['down']
                # Scaling up wins over scaling down when both trigger.
                if (rs_detal['up'] > 0):
                    self.scale(rs_detal['up'])
                elif (rs_detal['down'] > 0):
                    self.scale(0 - rs_detal['down'])
            except Exception as e:
                self.logger.debug(str(e))
            finally:
                time.sleep(self.config["TIME"]['monitor'])
class MarathonIF(object):
    """Thin wrapper around marathon.MarathonClient adding retry and poll helpers."""

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the list of all apps known to Marathon."""
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id):
        """Return the app for *app_id*, or None when it does not exist."""
        try:
            a = self.mcli.get_app(app_id)
        except marathon.exceptions.NotFoundError as e:  # NOQA
            return None
        return a

    def delete_app(self, app_id, force=False):
        """Delete the app; *force* overrides a deployment lock."""
        return self.mcli.delete_app(app_id, force)

    def delete_deployment(self, dep_id):
        """Cancel the deployment with id *dep_id*."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the list of currently running deployments."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete *app_id* if present, retrying up to *trys* times.

        Returns the delete result, or None when the app does not exist.
        After exhausting retries the last failure is re-raised.
        """
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        raise

    @staticmethod
    def is_valid_app_id(app_id):
        """Return True when *app_id* contains only allowed characters.

        Allowed: letters, digits, hyphens, slash, dot.
        """
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """ Create and start an app.

        :param app_id: (str) - Application ID
        :param attr: marathon.models.app.MarathonApp application to create.
        :return: the created app
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            # BUGFIX: the previous message claimed only lowercase letters,
            # digits and hyphens were allowed, contradicting is_valid_app_id,
            # which also accepts uppercase letters, slash and dot.
            l.error("Error: Only letters, digits, hyphens, slash and dot are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")
        # Marathon can briefly lock an app while another deployment is in
        # flight; retry for up to ~10 seconds before giving up.
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    time.sleep(1)
                else:
                    raise
        raise

    def wait_app_removal(self, app):
        """Poll until *app* disappears from Marathon; always returns True."""
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
        if cnt > 0:
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count):
        """Poll until *app* has exactly *running_count* running tasks."""
        cnt = 0
        while True:
            a1 = self.get_app(app)
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(1)
            # Log progress roughly every 30 seconds so long waits are visible.
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale):
        """Scale *app* to *scale* instances."""
        return self.mcli.scale_app(app, scale)

    def ping(self):
        """Ping the Marathon server."""
        return self.mcli.ping()
#!/usr/bin/env python3 import sys import yaml from marathon import MarathonClient from lambdas import * from models import * args = parse_args() manager = MarathonManager(get_marathon_url(args)) marathon_client = MarathonClient(get_marathon_url(args)) instances = {} for instance_name in sorted(args['instances'].keys()): try: seed_app = marathon_client.get_app('{}/cassandra-seed'.format(instance_name)) # node_app = marathon_client.get_app('/{}/cassandra-node'.format(env_name)) status = { 'flags': { 'active': True, 'converging': False, 'failed': seed_app.tasks_unhealthy > 0 } } seed_tasks_running = seed_app.tasks_running # node_tasks_running = node_app.tasks_running interfaces = { 'compute': {
class MarathonIF(object):
    """Thin wrapper around marathon.MarathonClient with retry/poll helpers."""

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the list of all apps known to Marathon."""
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id):
        """Return the app for *app_id*, or None when it does not exist."""
        try:
            a = self.mcli.get_app(app_id)
        except marathon.exceptions.NotFoundError as e:  # NOQA
            return None
        return a

    def delete_app(self, app_id, force=False):
        """Delete the app; *force* overrides a deployment lock."""
        return self.mcli.delete_app(app_id, force)

    def delete_deployment(self, dep_id):
        """Cancel the deployment with id *dep_id*."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the list of currently running deployments."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete *app_id* if present, retrying up to *trys* times.

        Returns the delete result, or None when the app does not exist.
        """
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        # NOTE(review): bare `raise` outside an except block raises a
        # RuntimeError here — presumably intended as "give up after retries".
        raise

    def create_app(self, app_id, attr):
        """Create an app, retrying while Marathon reports a deployment lock."""
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                # Deployment-lock errors are transient; wait and retry.
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    time.sleep(1)
                else:
                    raise
        raise

    def wait_app_removal(self, app):
        """Poll until *app* disappears from Marathon; always returns True."""
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
        if cnt > 0:
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count):
        """Poll until *app* has exactly *running_count* running tasks."""
        cnt = 0
        while True:
            a1 = self.get_app(app)
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(1)
            # Log progress roughly every 30 seconds so long waits are visible.
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale):
        """Scale *app* to *scale* instances."""
        return self.mcli.scale_app(app, scale)

    def ping(self):
        """Ping the Marathon server."""
        return self.mcli.ping()
class MarathonIF(object):
    """Marathon client wrapper; remote calls retry until a timeout expires."""

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the list of all apps known to Marathon."""
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id, timeout=300):
        """Return the app for *app_id*, None when absent.

        Transient client errors are retried every 10s until *timeout* seconds
        elapse, after which an Exception is raised.
        """
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                # Inner try distinguishes "app not found" (a normal answer)
                # from transport/server failures (retried by the outer loop).
                try:
                    a = self.mcli.get_app(app_id)
                except marathon.exceptions.NotFoundError as e:  # NOQA
                    return None
                return a
            except:
                l.info("mcli: get_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli get_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def delete_app(self, app_id, force=False, timeout=200):
        """Delete the app, retrying transient failures until *timeout*."""
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.delete_app(app_id, force)
                return
            except:
                l.info("mcli: delete_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli delete_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def delete_deployment(self, dep_id):
        """Cancel the deployment with id *dep_id*."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the list of currently running deployments."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete *app_id* if present, retrying up to *trys* times."""
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        raise

    @staticmethod
    def is_valid_app_id(app_id):
        """Return True when *app_id* matches the allowed character set."""
        # allowed: lowercase letters, digits, hyphens, slash, dot
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """ Create and start an app.

        :param app_id: (str) - Application ID
        :param attr: marathon.models.app.MarathonApp application to create.
        :return: the created app
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error(
                "Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s"
                % app_id)
            raise Exception("Invalid app_id")
        # Deployment-lock errors are transient; retry up to 10 times.
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    time.sleep(1)
                else:
                    raise
        raise

    def wait_app_removal(self, app):
        """Poll until *app* disappears from Marathon; always returns True."""
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
        if cnt > 0:
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count, sleep_before_next_try=1):
        """Poll until *app* has exactly *running_count* running tasks.

        If more tasks than expected are running, the app is scaled down first.
        """
        cnt = 0
        while True:
            a1 = self.get_app(app)
            # if tasks_running are greater (due to whatever reason, scale down accordingly)
            if a1.tasks_running > running_count:
                delta = a1.tasks_running - running_count
                l.info("Found [%d] more apps, scaling down to [%d]",
                       delta, running_count)
                self.scale_app(app, running_count)
                # Allow for some time before next poll
                time.sleep(1)
                continue
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(sleep_before_next_try)
            # Log progress roughly every 30 polls so long waits are visible.
            if (cnt % 30) == 29:
                l.info(
                    "[%d]Waiting for task to move to running stage, " % cnt +
                    "current stat staged=%d running=%d expected Running=%d" %
                    (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale, timeout=300):
        """Scale *app* to *scale*, retrying transient failures until *timeout*."""
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.scale_app(app, scale)
                return
            except:
                l.info("mcli: scale_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli scale_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def ping(self):
        """Ping the Marathon server."""
        return self.mcli.ping()

    def kill_task(self, app_id, task_id):
        """Kill the task *task_id* belonging to *app_id*."""
        return self.mcli.kill_task(app_id, task_id)
app_definition = MarathonApp.from_json(json.loads(marathon_app)) try: logging.info("Connecting to Marathon...") client = MarathonClient(marathon_urls, username=marathon_user, password=marathon_password, verify=False) except MarathonError as e: logging.error("Failed to connect to Marathon! {}".format(e)) exit_code = 1 sys.exit(exit_code) logging.info("Deploying application...") try: app = client.get_app(marathon_app_id) except MarathonHttpError: response = client.create_app(marathon_app_id, app_definition) version = response.version depolyment_id = response.deployments[0].id else: response = client.update_app(marathon_app_id, app_definition, force=marathon_force) version = response['version'] deployment_id = response['deploymentId'] logging.info("New version deployed: {}".format(version)) if app_definition.instances == 0: logging.info(
class MarathonHTTPClient(AbstractSchedulerClient):
    """Scheduler client that manages containers as Marathon apps.

    One-off commands are delegated to a fleet client; container state is
    inspected directly via the Docker daemon on the task's host.
    """

    def __init__(self, target, auth, options, pkey):
        super(MarathonHTTPClient, self).__init__(target, auth, options, pkey)
        self.target = settings.MARATHON_HOST
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://' + self.target + ':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options, pkey)

    # helpers

    def _app_id(self, name):
        # Marathon app ids use dots where job names use underscores.
        return name.replace('_', '.')

    # container api

    def create(self, name, image, command='', **kwargs):
        """Create a new container"""
        app_id = self._app_id(name)
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        image = self.registry + '/' + image
        # Parse the memory limit for this container type, e.g. "512M" -> 512.
        mems = kwargs.get('memory', {}).get(l['c_type'])
        m = 0
        if mems:
            mems = mems.lower()
            # Strip a two-letter unit suffix (e.g. "mb") down to one letter.
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get('cpu', {}).get(l['c_type'])
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(
            **locals())
        # Created with 0 instances: the app exists but nothing runs yet.
        self.client.create_app(
            app_id, MarathonApp(cmd=cmd, mem=m, cpus=c, instances=0))
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container."""
        self.client.scale_app(self._app_id(name), 1, force=True)
        # Wait for the single task to be reported running, then for the
        # container itself to come up on its host.
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(self._app_id(name)).tasks[0].host
        self._waitforcontainer(host, name)

    def destroy(self, name):
        """Destroy a container."""
        try:
            host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(host, name)
        except:
            # Best effort: if the task/host lookup fails, still delete the app.
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        # Ask the Docker daemon on *host* whether the container is running.
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200,
                            version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        # Poll until the container reports up, else fail loudly.
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200,
                            version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Display the given job's running state."""
        try:
            for _ in xrange(POLL_ATTEMPTS):
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(
                        self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed
def run_app(self):
    """
    Creates the Marathon Server connection. When using json config files,
    the tool is used to ensure state of the App. If the App doesn't exist,
    it tries to create it. This tool can also be used to delete Apps from
    Marathon via either a json config file or using the App name from the
    command line.
    """
    # if not a single modifier is specified, show the usage string instead
    # of segfaulting
    if not any([
            self.args.list_apps,
            self.args.config_file,
            self.args.get_app,
            self.args.delete,
    ]):
        self.parser.print_help()
        self.parser.exit()
    server_str = "http://" + self.marathon_host + ":" + self.marathon_port
    marathon_server = MarathonClient(
        server_str,
        username=self.marathon_user,
        password=self.marathon_pass,
    )
    # validate socket connection with given host and port
    if self.api.connect(self.marathon_host, int(self.marathon_port)):
        self.logger.info('Connection success')
    else:
        self.logger.error('Error connecting to Server')
        raise IOError('Error connecting to Server')
    # list all apps if flag is called
    if self.marathon_list_apps:
        apps = self.api.get_marathon_apps(marathon_server)
        if self.json:
            # the JSON encoder in marathon.util recursively translates all
            # items into JSON friendly structures
            print(
                json.dumps(
                    apps,
                    cls=MarathonJsonEncoder,
                    sort_keys=True,
                    indent=4,
                    separators=(',', ': '),
                ))
        else:
            for app in apps:
                print('{0} => {1}'.format(app.id, app.cmd))
    # Config file load, only if we passed the variable
    if self.marathon_config_file and not self.args.delete:
        config_file_data = self.api.read_config_file(
            self.marathon_config_file)
        # make it possible to load more than 1 app from a single source file
        if isinstance(config_file_data, dict):
            self.logger.debug(
                'found a single app definition in config file')
            apps = [config_file_data]
        elif isinstance(config_file_data, list):
            self.logger.debug(
                'found a list of app definitions in config file')
            apps = config_file_data
        else:
            raise ValueError(
                'Input config file appears to be in the wrong format')
        for app in apps:
            # get a specific marathon app
            marathon_app_result = self.api.get_marathon_app(
                marathon_server, app, app["id"])
            self.logger.info('marathon app before updates: ')
            self.logger.info(marathon_app_result)
            # update local app data variable with config file values
            changes_in_json, new_marathon_app = self.api.assign_config_data(
                app, marathon_app_result)
            # update a marathon app if there was a change in the json file
            if changes_in_json:
                self.logger.info('marathon app after updates: ')
                self.api.update_marathon_app(marathon_server, app,
                                             new_marathon_app)
    elif self.args.get_app:
        # No config file: look up a single app by name from the command line.
        self.logger.info(self.args.get_app)
        config_file_data = None
        marathon_app_result = self.api.get_marathon_app(
            marathon_server, config_file_data, self.args.get_app)
        self.logger.info(marathon_app_result)
    # Delete marathon app
    if self.args.delete:
        # check if the named app exists
        try:
            marathon_app_result = marathon_server.get_app(self.args.delete)
            self.logger.info('Deleting %s', self.args.delete)
            self.api.delete_marathon_app(marathon_server, self.args.delete)
        except MarathonHttpError as marathon_exception:
            # A 404 just means there is nothing to delete; anything else
            # is a real failure and aborts the run.
            if re.search('HTTP 404', str(marathon_exception)):
                self.logger.info(
                    'app %s does not exist, nothing to delete',
                    self.args.delete)
            else:
                self.logger.error(marathon_exception)
                sys.exit(1)
        except Exception as e:
            self.logger.error(e)
            sys.exit(1)
def get_app_json(self, id):
    """Fetch the app identified by *id* and return its JSON representation."""
    # Call the base-class lookup explicitly, then serialize the result.
    fetched = MarathonClient.get_app(self, id)
    return fetched.to_json()
def get_app_json(self, id):
    """Return the JSON serialization of the app with the given *id*."""
    # Delegate the lookup to the MarathonClient base class and serialize.
    return MarathonClient.get_app(self, id).to_json()
class MarathonHTTPClient(object):
    """Container scheduler backed by Marathon (older, object-based variant).

    One-off commands are delegated to a fleet client; container state is
    inspected directly via the Docker daemon on the task's host.
    """

    def __init__(self, target, auth, options, pkey):
        self.target = settings.MARATHON_HOST
        self.auth = auth
        self.options = options
        self.pkey = pkey
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://'+self.target+':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options, pkey)

    # helpers

    def _app_id(self, name):
        # Marathon app ids use dots where job names use underscores.
        return name.replace('_', '.')

    # container api

    def create(self, name, image, command='', **kwargs):
        """Create a container"""
        app_id = self._app_id(name)
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        image = self.registry + '/' + image
        # Parse the memory limit for this container type, e.g. "512M" -> 512.
        mems = kwargs.get('memory', {}).get(l['c_type'])
        m = 0
        if mems:
            mems = mems.lower()
            # Strip a two-letter unit suffix (e.g. "mb") down to one letter.
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get('cpu', {}).get(l['c_type'])
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(**locals())
        # Create, then immediately scale to 0 so nothing runs until start().
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=m, cpus=c))
        self.client.scale_app(app_id, 0, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container"""
        self.client.scale_app(self._app_id(name), 1, force=True)
        # Wait for the single task to be reported running, then for the
        # container itself to come up on its host.
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(self._app_id(name)).tasks[0].host
        self._waitforcontainer(host, name)

    def stop(self, name):
        """Stop a container"""
        raise NotImplementedError

    def destroy(self, name):
        """Destroy a container"""
        try:
            host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(host, name)
        except:
            # Best effort: if the task/host lookup fails, still delete the app.
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        # Ask the Docker daemon on *host* whether the container is running.
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        # Poll until the container reports up, else fail loudly.
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200, version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command"""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Return the job's JobState based on its running task count."""
        try:
            for _ in xrange(POLL_ATTEMPTS):
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed

    def attach(self, name):
        """ Attach to a job's stdin, stdout and stderr """
        raise NotImplementedError
class MarathonManager(object):
    """Convenience wrapper around MarathonClient for Tonomi-labelled apps."""

    def __init__(self, server):
        # BUGFIX: keep the server address on the instance; __repr__ reads
        # self.server, which was previously never assigned and raised
        # AttributeError on every repr() call.
        self.server = server
        self._client = MarathonClient(server)

    def __repr__(self):
        return self.server

    def create(self, app):
        """Create *app* via its own _create hook, using our client."""
        app._create(self._client)

    def discover(self, app_filter=None, env_filter=False):
        """Return app names (or environment names when *env_filter* is set).

        Only apps whose '_tonomi_application' label matches *app_filter* are
        considered (all apps when no filter is given).
        """
        apps = set()
        for app in self._client.list_apps():
            if not app_filter or ('_tonomi_application', app_filter) in app.labels.items():
                if not env_filter:
                    apps.add(reduce_app_name(app.id))
                else:
                    if '_tonomi_environment' in app.labels.keys():
                        env_name = app.labels['_tonomi_environment']
                        apps.add('/{}'.format(env_name))
        return list(apps)

    def get_apps(self, app_type, env_name):
        """Return full app objects matching the given type and environment labels."""
        env_name = env_name.replace('/', '')
        apps = []
        for app in self._client.list_apps():
            if ('_tonomi_environment', env_name) in app.labels.items() and (
                    '_tonomi_application', app_type) in app.labels.items():
                apps.append(app)
        # Re-fetch each app so callers get the detailed representation.
        return [self._client.get_app(app.id) for app in apps]

    def get_app_host(self, app_type, env_name):
        """Block until a matching app has a task, then return that task's host."""
        while True:
            apps = self.get_apps(app_type=app_type, env_name=env_name)
            for app in apps:
                for task in app.tasks:
                    host = task.host
                    return host
            time.sleep(5)

    def health_check(self):
        pass

    def destroy(self, name):
        """Best-effort delete of the group *name*; failures are ignored."""
        try:
            self._client.delete_group(name, force=True)
        except:
            pass

    def update(self):
        pass

    def restart(self):
        pass

    def scale_app(self, app_name, num):
        """Force-scale *app_name* to *num* instances."""
        self._client.scale_app(app_name, num, force=True)

    def free_ports(self, num=1):
        """Return *num* free ports as reported by the cluster helper."""
        return get_free_ports(self._client, num)
class MarathonHTTPClient(AbstractSchedulerClient):
    """Scheduler client that manages containers as Marathon apps.

    One-off commands are delegated to a fleet client; container state is
    inspected directly via the Docker daemon on the task's host.
    """

    def __init__(self, target, auth, options, pkey):
        super(MarathonHTTPClient, self).__init__(target, auth, options, pkey)
        self.target = settings.MARATHON_HOST
        self.registry = settings.REGISTRY_HOST + ":" + settings.REGISTRY_PORT
        self.client = MarathonClient("http://" + self.target + ":8180")
        self.fleet = FleetHTTPClient("/var/run/fleet.sock", auth, options, pkey)

    # helpers

    def _app_id(self, name):
        # Marathon app ids use dots where job names use underscores.
        return name.replace("_", ".")

    # container api

    def create(self, name, image, command="", **kwargs):
        """Create a new container"""
        app_id = self._app_id(name)
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        image = self.registry + "/" + image
        # Parse the memory limit for this container type, e.g. "512M" -> 512.
        mems = kwargs.get("memory", {}).get(l["c_type"])
        m = 0
        if mems:
            mems = mems.lower()
            # Strip a two-letter unit suffix (e.g. "mb") down to one letter.
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get("cpu", {}).get(l["c_type"])
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(**locals())
        # Created with 0 instances: the app exists but nothing runs yet.
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=m, cpus=c, instances=0))
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container."""
        self.client.scale_app(self._app_id(name), 1, force=True)
        # Wait for the single task to be reported running, then for the
        # container itself to come up on its host.
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(self._app_id(name)).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(self._app_id(name)).tasks[0].host
        self._waitforcontainer(host, name)

    def destroy(self, name):
        """Destroy a container."""
        try:
            host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(host, name)
        except:
            # Best effort: if the task/host lookup fails, still delete the app.
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        # Ask the Docker daemon on *host* whether the container is running.
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version="1.17")
        try:
            if docker_cli.inspect_container(name)["State"]["Running"]:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        # Poll until the container reports up, else fail loudly.
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version="1.17")
        if docker_cli.inspect_container(name)["State"]:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Display the given job's running state."""
        try:
            for _ in xrange(POLL_ATTEMPTS):
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed
class Scaler:
    """Autoscaler for a single Marathon app.

    Reads app metadata and scaling policies from a MariaDB REST service,
    container metrics from InfluxDB, and scales the app through Marathon,
    regenerating the HAProxy config after each change.
    """

    def __init__(self, app_name, config):
        self.logger = logging.getLogger("autoscaling")
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug("Init object scaler...")
        self.config = config
        # Fetch app record and its scaling policies from the REST service.
        self.logger.debug("Connect RESTful mariadb and get policies...")
        conn = http.client.HTTPConnection(config["MARIA_RESTFUL"]['host'],
                                          config["MARIA_RESTFUL"]['port'])
        conn.request("GET", "/app/name/"+app_name)
        json_app = conn.getresponse().read().decode("utf-8")
        self.app = json.loads(json_app)
        conn.request("GET", "/app/name/"+app_name+"/policies")
        json_policies = conn.getresponse().read().decode("utf-8")
        self.app["policies"] = json.loads(json_policies)
        self.logger.debug("Connect influxdb and marathon...")
        self.influx_client = InfluxDBClient(config["INFLUXDB"]["host"],
                                            config["INFLUXDB"]["port"],
                                            config["INFLUXDB"]["username"],
                                            config["INFLUXDB"]["password"],
                                            config["INFLUXDB"]["db_name"])
        self.marathon_client = MarathonClient('http://'+config["MARATHON"]['host']+':'+config["MARATHON"]['port'])
        # Seed current instance count and per-instance resources from Marathon.
        self.app["instance"] = self.marathon_client.get_app(app_name).instances
        self.app["mem"] = self.marathon_client.get_app(app_name).mem
        self.app["cpus"] = self.marathon_client.get_app(app_name).cpus
        self.logger.debug("Reconfig haproxy.cfg...")
        os.system("sudo ./servicerouter.py --marathon http://"+config["MARATHON"]["host"]+":"+config["MARATHON"]["port"]+" --haproxy-config /etc/haproxy/haproxy.cfg")

    def setup_logging(self, log_file = "autoscaling.log", level = logging.INFO, formatter = None):
        """Attach a file handler to the autoscaling logger."""
        if(formatter == None):
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh = logging.FileHandler(log_file)
        fh.setLevel(level)
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

    def get_cpu_usage(self, container_name):
        """Return the CPU usage (percent of the app's cpus) of *container_name*."""
        query = "select DERIVATIVE(cpu_cumulative_usage) as cpu_usage from stats where container_name = '"+container_name+"' and time > now()-5m group by time(2s) "
        result = self.influx_client.query(query)
        points = result[0]["points"]
        # Derivative is in ns/s; normalize by allotted cpus to a percentage.
        return (points[0][1]/1000000000/self.app["cpus"])*100

    def get_container_name(self, mesos_task_id):
        """Return the container name mapped to *mesos_task_id* in InfluxDB."""
        query = "select container_name from "+self.config["INFLUXDB"]["ts_mapping"]+" where time>now() - 5m and mesos_task_id = '" +mesos_task_id+"' limit 1"
        result = self.influx_client.query(query)
        points = result[0]["points"]
        # NOTE(review): assumes column 2 of the point is container_name —
        # verify against the InfluxDB series layout.
        return points[0][2]

    def get_containers_name(self):
        """Return the container names of all tasks of this app."""
        tasks = self.marathon_client.list_tasks(self.app["name"])
        containers_name = []
        for task in tasks:
            containers_name.append(self.get_container_name(task.id))
        return containers_name

    def avg_mem_usage(self, containers_name):
        """Return the average memory usage (percent of app mem) over *containers_name*."""
        number_container = len(containers_name)
        containers_name = ["'"+x+"'" for x in containers_name]
        containers_name = ",".join(containers_name)
        query = "select memory_usage,container_name from stats where time > now()-5m and container_name in ("+containers_name+") limit "+str(number_container*2)
        result = self.influx_client.query(query)
        points = result[0]["points"]
        sum_memory_usage = 0
        for point in points:
            # point[3] is memory_usage in bytes; skip null samples.
            if(point[3] != None):
                sum_memory_usage += point[3]/(self.app["mem"]*1048576)*100
        return sum_memory_usage / number_container

    def avg_cpu_usage(self, containers_name):
        """Return the average CPU usage (percent of app cpus) over *containers_name*."""
        number_container = len(containers_name)
        containers_name = ["'"+x+"'" for x in containers_name]
        containers_name = ",".join(containers_name)
        query = "select DERIVATIVE(cpu_cumulative_usage) as cpu_usage,container_name from stats where time > now()-5m and container_name in ("+containers_name+") group by time(10s),container_name limit "+str(number_container)
        result = self.influx_client.query(query)
        points = result[0]["points"]
        sum_cpu_usage = 0
        for point in points:
            sum_cpu_usage += point[1]/1000000000/self.app["cpus"]*100
        return sum_cpu_usage / number_container

    def scale(self, delta):
        """Scale the app by *delta* instances (add or remove), clamped to
        the app's min/max bounds, then regenerate the HAProxy config.

        @param int delta number instances to add (positive) or remove (negative)
        """
        new_instance = self.app["instance"] + delta
        # Clamp to the configured instance bounds.
        if(new_instance > self.app['max_instances']):
            new_instance = self.app['max_instances']
        if(new_instance < self.app['min_instances']):
            new_instance = self.app['min_instances']
        if(new_instance != self.app["instance"]):
            self.marathon_client.scale_app(self.app["name"], new_instance)
            self.logger.debug("Scaling "+self.app["name"]+" to: "+str(new_instance))
            self.logger.debug("Waiting for config file haproxy.cfg...")
            time.sleep(self.config["TIME"]['w_config_ha'])
            self.logger.debug("Config file haproxy.cfg...")
            os.system("sudo ./servicerouter.py --marathon http://"+self.config["MARATHON"]["host"]+":"+self.config["MARATHON"]["port"]+" --haproxy-config /etc/haproxy/haproxy.cfg")
            # Re-read the instance count Marathon actually applied.
            self.app["instance"] =self.marathon_client.get_app(self.app["name"]).instances
            self.logger.debug("Sleep "+str(self.config["TIME"]['after_scale'])+"s...")
            time.sleep(self.config["TIME"]['after_scale'])

    def check_rule(self, policie, value):
        """Check one policy against the metric values.

        @param policie policy row with thresholds and step sizes
        @param tuple value metric values indexed by the policy's metric_type
        @return dict {'up': n, 'down': m} instance deltas suggested by this policy
        """
        delta = {}
        delta["up"] = 0
        delta["down"] = 0
        # Check upper_threshold
        if(value[policie["metric_type"]] > policie["upper_threshold"]):
            delta['up'] = policie["instances_in"]
        # Check lower_threshold
        if(value[policie["metric_type"]] < policie["lower_threshold"]):
            delta['down'] = policie["instances_out"]
        return delta

    def autoscaling(self):
        """Main loop: sample metrics, evaluate policies, scale, repeat."""
        while True:
            try:
                containers_name = self.get_containers_name()
                avg_cpu = self.avg_cpu_usage(containers_name)
                avg_mem = self.avg_mem_usage(containers_name)
                self.logger.info("Avg cpu usage, avg memmory usage, current instance: %f %f %d", avg_cpu, avg_mem, self.app["instance"])
                # Aggregate: scale up by the largest requested step,
                # scale down only by the smallest.
                rs_detal = {}
                rs_detal['up'] = 0
                rs_detal['down'] = 10
                for policie in self.app["policies"]:
                    delta = self.check_rule(policie, (avg_cpu, avg_mem))
                    if(rs_detal['up'] < delta['up']):
                        rs_detal['up'] = delta['up']
                    if(rs_detal['down'] > delta['down']):
                        rs_detal['down'] = delta['down']
                # Scaling up takes priority over scaling down.
                if(rs_detal['up'] > 0):
                    self.scale(rs_detal['up'])
                elif(rs_detal['down'] > 0):
                    self.scale(0-rs_detal['down'])
            except Exception as e:
                self.logger.debug(str(e))
            finally:
                time.sleep(self.config["TIME"]['monitor'])
class MarathonSpawner(Spawner):
    """JupyterHub spawner that runs each user's notebook server as a Marathon
    app using the MESOS containerizer with host networking.

    Fixes applied in review:
    * ``options_form``: the mem field's ``max`` attribute used ``$(max_mem)s``
      instead of ``%(max_mem)s``, so the placeholder was emitted literally and
      the browser ignored the upper bound.
    * ``start()``: the pre-flight ``get_app_info`` call was missing ``yield``;
      ``get_app_info`` is decorated with ``@run_on_executor`` and returns a
      Future, which is always truthy, so the create/update branch always chose
      ``update_app`` — even for apps that did not exist yet.
    """

    # Docker image for the single-user server; tag tracks the JupyterHub X.Y version.
    app_image = Unicode("jupyterhub/singleuser:%s" % _jupyterhub_xy, config=True)

    app_prefix = Unicode("jupyter",
                         help=dedent("""
            Prefix for app names. The full app name for a particular user will be
            <prefix>/<username>/notebook.
            """)).tag(config=True)

    marathon_host = Unicode(
        u'', help="Hostname of Marathon server").tag(config=True)

    marathon_constraints = List(
        [], help='Constraints to be passed through to Marathon').tag(config=True)

    unreachable_strategy = Any(
        None,
        help='Unreachable strategy to be passed through to Marathon').tag(
            config=True)

    volumes = List([],
                   help=dedent("""
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]
            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive it will be
            replaced with the current user's name.
            """)).tag(config=True)

    # Resource defaults and the upper bounds offered in the options form.
    max_cpu = Float(2, config=True)
    cpu = Float(1, config=True)
    max_mem = Float(4096, config=True)
    mem = Float(1024, config=True)
    max_disk = Float(20000, config=True)
    disk = Float(5000, config=True)
    max_gpu = Integer(0, config=True)
    gpu = Integer(0, config=True)

    # Unix user the Mesos task runs as; None lets Marathon use its default.
    mesos_user = Unicode(None, config=True, allow_none=True)

    autotimeout = Integer(
        None,
        help="Seconds to automatically timeout unused notebook servers",
        config=True,
        allow_none=True)

    hub_ip_connect = Unicode(
        "", help="Public IP address of the hub").tag(config=True)

    @observe('hub_ip_connect')
    def _ip_connect_changed(self, change):
        # Deprecated knob: JupyterHub >= 0.8 provides hub_connect_ip natively.
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_ip_connect is no longer needed with JupyterHub 0.8."
                " Use JupyterHub.hub_connect_ip instead.",
                DeprecationWarning,
            )

    hub_port_connect = Integer(
        -1, help="Public PORT of the hub").tag(config=True)

    @observe('hub_port_connect')
    def _port_connect_changed(self, change):
        # Deprecated knob: JupyterHub >= 0.8 provides hub_connect_port natively.
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_port_connect is no longer needed with JupyterHub 0.8."
                " Use JupyterHub.hub_connect_port instead.",
                DeprecationWarning,
            )

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # fix default port to 8888, used in the container
    @default('port')
    def _port_default(self):
        return 8888

    # default to listening on all-interfaces in the container
    @default('ip')
    def _ip_default(self):
        return '0.0.0.0'

    # Class-level pool shared by all spawner instances for blocking Marathon calls.
    _executor = None

    @property
    def executor(self):
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(5)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host)
        # Prime stored_user_options so the options form can render before the
        # first persisted state exists.
        self.get_state()

    @property
    def app_id(self):
        """Marathon app id for this user's notebook server."""
        return '/%s/%s/notebook' % (self.app_prefix, self.user.name)

    def get_state(self):
        """Persist the user's chosen options alongside the base Spawner state."""
        state = super(MarathonSpawner, self).get_state()
        state['user_options'] = self.stored_user_options = self.user_options
        return state

    def load_state(self, state):
        super(MarathonSpawner, self).load_state(state)
        self.stored_user_options = state.get('user_options', {})

    def get_health_checks(self):
        """Build the TCP health check Marathon uses to gate task health."""
        health_checks = []
        health_checks.append(
            MarathonHealthCheck(protocol='TCP',
                                port_index=0,
                                grace_period_seconds=300,
                                interval_seconds=30,
                                timeout_seconds=20,
                                max_consecutive_failures=0))
        return health_checks

    def get_volumes(self):
        """Convert configured volume dicts to Marathon objects, expanding
        {username} templates in paths and external names."""
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(
                    mv.external['name'], self)
            volumes.append(mv)
        return volumes

    def get_constraints(self):
        """Parse configured placement constraints into Marathon objects."""
        constraints = []
        for c in self.marathon_constraints:
            constraints.append(MarathonConstraint.from_json(c))
        return constraints

    def get_ip_and_port(self, app_info):
        """Resolve (ip, port) of the app's single task."""
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        return (ip, app_info.tasks[0].ports[0])

    @run_on_executor
    def get_app_info(self, app_id):
        """Fetch the app (with tasks) from Marathon; None if not created yet.

        Runs on the executor — callers inside coroutines must ``yield`` it.
        """
        try:
            app = self.marathon.get_app(app_id, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet", app_id)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        # Rebuild the hub API URL with the publicly reachable ip/port overrides.
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((uri.scheme, '%s:%s' % (ip, port), uri.path,
                           uri.params, uri.query, uri.fragment))

    def get_args(self):
        """Extend the base args: rewrite --hub-api-url for old hubs and bind
        the notebook to the Marathon-assigned $PORT0."""
        args = super().get_args()
        if self.hub_ip_connect:
            # JupyterHub 0.7 specifies --hub-api-url
            # on the command-line, which is hard to update
            for idx, arg in enumerate(list(args)):
                if arg.startswith('--hub-api-url='):
                    args.pop(idx)
                    break
            args.append('--hub-api-url=%s' % self._public_hub_api_url())
        for idx, arg in enumerate(list(args)):
            if arg.startswith('--port='):
                args.pop(idx)
                break
        # $PORT0 is substituted by Marathon with the allocated host port.
        args.append('--port=$PORT0')
        return args

    def options_from_form(self, formdata):
        """Translate the HTML form submission into typed user_options."""
        options = {}
        options['app_image'] = formdata['app_image'][0] or None
        if 'force_pull_image' in formdata:
            options['force_pull_image'] = formdata['force_pull_image'][
                0] == 'on'
        options['cpu'] = float(formdata['cpu'][0])
        options['mem'] = float(formdata['mem'][0])
        options['disk'] = float(formdata['disk'][0])
        if formdata.get('gpu', None):
            options['gpu'] = int(formdata['gpu'][0])
        return options

    @property
    def options_form(self):
        """Render the spawn-options HTML form, pre-filled with stored choices."""
        template = """
        <div class="form-group">
            <label for="app_image">Image <span class="label label-default">Optional</span></label>
            <input id="app_image" class="form-control" name="app_image" type="text" placeholder="e.g. %(default_app_image)s" value="%(app_image)s" />
        </div>
        <div class="checkbox">
            <label for="force_pull_image">
                <input id="force_pull_image" name="force_pull_image" type="checkbox" value="on" />
                Force pull image
            </label>
        </div>
        <div class="form-group">
            <div class="row">
                <div class="col-sm-4">
                    <label for="cpu">CPU</label>
                    <input id="cpu" class="form-control" name="cpu" type="number" step="any" value="%(cpu)s" min="%(min_cpu)s" max="%(max_cpu)s" required />
                </div>
                <div class="col-sm-4">
                    <label for="mem">Mem (MiB)</label>
                    <input id="mem" class="form-control" name="mem" type="number" step="any" value="%(mem)s" min="%(min_mem)s" max="%(max_mem)s" required />
                </div>
                <div class="col-sm-4">
                    <label for="disk">Disk (MiB)</label>
                    <input id="disk" class="form-control" name="disk" type="number" step="any" value="%(disk)s" min="%(min_disk)s" max="%(max_disk)s" required />
                </div>
            </div>
        </div>
        """ % {
            'default_app_image': self.app_image,
            'app_image': self.stored_user_options.get('app_image', None) or '',
            'min_cpu': 0.001,
            'max_cpu': self.max_cpu,
            'cpu': remove_zeros(str(self.stored_user_options.get('cpu', self.cpu))),
            'min_mem': 32,
            'max_mem': self.max_mem,
            'mem': remove_zeros(str(self.stored_user_options.get('mem', self.mem))),
            'min_disk': 1000,
            'max_disk': self.max_disk,
            'disk': remove_zeros(str(self.stored_user_options.get('disk', self.disk))),
        }
        if self.max_gpu > 0:
            template += """
        <div class="form-group">
            <div class="row">
                <div class="col-sm-4">
                    <label for="gpu">GPU</label>
                    <input id="gpu" class="form-control" name="gpu" type="number" step="1" value="%(gpu)s" min="%(min_gpu)s" max="%(max_gpu)s" required />
                </div>
            </div>
        </div>
        """ % {
                'min_gpu': 0,
                'max_gpu': self.max_gpu,
                'gpu': self.stored_user_options.get('gpu', self.gpu),
            }
        return """<div>%s</div>""" % template

    @gen.coroutine
    def start(self):
        """Create (or update) the Marathon app and wait for a healthy task.

        Returns (ip, port) of the running notebook server; raises
        MarathonSpawnerException if the app disappears or has no instances.
        """
        app_image = self.user_options.get('app_image', None) or self.app_image
        force_pull_image = self.user_options.get('force_pull_image', False)
        self.log.info("starting a Marathon app with image=%s" % app_image)
        container_params = {
            'image': app_image,
            'force_pull_image': force_pull_image
        }
        docker_container = MarathonDockerContainer(**container_params)
        app_container = MarathonContainer(docker=docker_container,
                                          type='MESOS',
                                          volumes=self.get_volumes())
        cpu = self.user_options.get('cpu', None)
        mem = self.user_options.get('mem', None)
        disk = self.user_options.get('disk', None)
        gpu = self.user_options.get('gpu', None)
        self.log.info("resource: (cpu=%s, mem=%s, disk=%s, gpu=%s)" %
                      (cpu, mem, disk, gpu))
        cmd = self.cmd + self.get_args()
        env = self.get_env()
        port_definitions = [PortDefinition(port=0, protocol='tcp')]
        app_request = MarathonApp(
            id=self.app_id,
            cmd=' '.join(
                cmd),  # cmd does not use Docker image's default entrypoint
            env=env,
            cpus=cpu,
            mem=mem,
            disk=disk,
            gpus=gpu,
            user=self.mesos_user,
            container=app_container,
            port_definitions=port_definitions,
            networks=[{
                'mode': 'host'
            }],
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            unreachable_strategy=self.unreachable_strategy,
            instances=1)
        # BUGFIX: must yield — get_app_info returns a Future (always truthy),
        # so without yield the update branch was taken unconditionally.
        app_info = yield self.get_app_info(self.app_id)
        try:
            if app_info:
                self.marathon.update_app(self.app_id, app_request, force=True)
            else:
                self.marathon.create_app(self.app_id, app_request)
        except Exception as e:
            self.log.error("Failed to create application for %s: %s",
                           self.app_id, e)
            raise e
        while True:
            app_info = yield self.get_app_info(self.app_id)
            if app_info is None:
                raise MarathonSpawnerException("Application %s is lost",
                                               self.app_id)
            elif app_info.instances == 0:
                raise MarathonSpawnerException(
                    "No instance for application %s", self.app_id)
            elif app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Scale the app to zero instances; unless now=True, wait for the
        teardown deployment to finish."""
        try:
            self.marathon.update_app(self.app_id,
                                     MarathonApp(instances=0),
                                     force=True)
        except Exception as e:
            self.log.error("Failed to delete application %s", self.app_id)
            raise e
        else:
            if not now:
                while True:
                    app_info = yield self.get_app_info(self.app_id)
                    if app_info is None:
                        # Stopping application is lost, just ignore it!
                        break
                    elif len(app_info.deployments) == 0:
                        # This is the success case.
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Spawner poll: None while healthy, small int status otherwise.

        3 = app vanished, 1 = shutting down, 2 = no healthy task,
        0 = stopped due to inactivity timeout.
        """
        app_info = yield self.get_app_info(self.app_id)
        if app_info is None:
            self.log.error("Application %s is lost", self.app_id)
            return 3
        for deployment in app_info.deployments:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down",
                                   self.app_id)
                    return 1
        if app_info.tasks_healthy == 0:
            self.log.error("No healthy instance for application %s",
                           self.app_id)
            return 2
        if self.autotimeout is not None:
            tm_diff = datetime.utcnow() - self.user.last_activity
            self.log.debug("Application %s is inactive for %d sec",
                           self.app_id, tm_diff.seconds)
            if tm_diff > timedelta(seconds=self.autotimeout):
                self.log.info(
                    "Stopping application %s because it's inactive for more than %d sec",
                    self.app_id, self.autotimeout)
                # Do not yield the result of stop here
                self.stop()
                return 0
        return None
userid = os.getenv('MARATHON_USER') or input('Enter the username for the DCOS cluster: ') password = os.getenv('MARATHON_PWD') or input('Enter the password for the DCOS cluster: ') marathon_app = os.getenv('APP_NAME') or input("Enter the Marathon Application Name to scale (eg: /worker): ") redis_uri = os.getenv('REDIS_URI') or input("Enter the Redis URI, including password (eg: redis://localhost:8999/2): ") max_instances = os.getenv('MAX_INSTANCES') or 10 min_instances = os.getenv('MIN_INSTANCES') or 1 c = MarathonClient(dcos_master, username=userid, password=password) r = StrictRedis.from_url(redis_uri) while True: print("Loop!") app = None while app is None: try: app = c.get_app(marathon_app) except MarathonError as err: print(err) app = None time.sleep(1) waitingDocs = r.llen("celery") instances = app.instances if waitingDocs == instances: pass elif waitingDocs > instances: instances += 1 else: instances -= 1
class MarathonSpawner(Spawner):
    """JupyterHub spawner for the Zeta architecture: runs per-user notebook
    servers as Marathon DOCKER apps, with per-user overrides loaded from a
    JSON-lines ``zeta_user_file``.

    Fix applied in review: ``get_constraints`` was missing its ``return``
    statement, so it always returned None and every app was created with
    ``constraints=None`` — configured placement constraints were silently
    dropped (the sibling spawner variants do return the list).
    """

    # Load the app image
    app_image = Unicode("jupyterhub/singleuser", config=True)

    # The command to run
    app_cmd = Unicode("jupyter notebook", config=True)

    # This is the prefix in Marathon
    app_prefix = Unicode(
        "jupyter",
        help=dedent(
            """
            Prefix for app names. The full app name for a particular user will be
            <prefix>/<username>.
            """
        )
    ).tag(config=True)

    user_web_port = Integer(0, help="Port that the Notebook is listening on").tag(config=True)
    user_ssh_port = Integer(0, help="SSH Port that the container is listening on").tag(config=True)
    user_ssh_host = Unicode('', help="Hostname of the ssh container").tag(config=True)
    use_jupyterlab = Integer(0, help="Use Jupyterlab - Jupyterlab is 1 default is 0 or Jupyternotebook").tag(config=True)
    user_ssh_hagroup = Unicode('', help="HAProxy group for ssh container port").tag(config=True)

    # zeta_user_file are the users and their custom settings for installation
    # in Zeta Architechure. If this is blank, defaults from Jupyter Hub are
    # used for Mem, CPU, Ports, Image.
    zeta_user_file = Unicode(
        "",
        help="Path to json file that includes users and per user settings"
    ).tag(config=True)

    no_user_file_fail = Bool(
        True,
        help="Is zeta_user_file is provided, but can't be opened fail. (Default). False loads defaults and tries to spawn"
    ).tag(config=True)

    # Marathon Server
    marathon_host = Unicode(
        u'',
        help="Hostname of Marathon server").tag(config=True)

    marathon_user_name = Unicode(
        u'',
        help='Marathon user name'
    ).tag(config=True)

    marathon_user_password = Unicode(
        u'',
        help='Marathon user password'
    ).tag(config=True)

    fetch = List([], help='Optional files to fetch').tag(config=True)

    custom_env = List(
        [],
        help='Additional ENVs to add to the default. Format is a list of 1 record dictionary. [{key:val}]'
    ).tag(config=True)

    # Constraints in Marathon
    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    # Shared Notebook location
    shared_notebook_dir = Unicode(
        '',
        help="Shared Notebook location that users will get a link to in their notebook location - can be blank"
    ).tag(config=True)

    ports = List(
        [8888],
        help='Ports to expose externally'
    ).tag(config=True)

    volumes = List(
        [],
        help=dedent(
            """
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]
            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive it will be
            replaced with the current user's name.
            """
        )
    ).tag(config=True)

    network_mode = Unicode(
        'BRIDGE',
        help="Enum of BRIDGE or HOST"
    ).tag(config=True)

    hub_ip_connect = Unicode(
        "",
        help="Public IP address of the hub"
    ).tag(config=True)

    hub_port_connect = Integer(
        -1,
        help="Public PORT of the hub"
    ).tag(config=True)

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """
    ).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # Single background thread shared by all instances for blocking Marathon calls.
    _executor = None

    @property
    def executor(self):
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host, self.marathon_user_name, self.marathon_user_password)

    @property
    def container_name(self):
        """Marathon app id for this user's server (prefix/user/servername)."""
        self.log.info("Container Name : %s / %s / %s", self.app_prefix, self.user.name, self.name)
        try:
            self.log.info("Debug %s", json.dumps(self.name))
        except:
            self.log.info("Could not log self")
        return '/%s/%s%s' % (self.app_prefix, self.user.name, self.name)

    def get_state(self):
        """Persist the container name alongside the base Spawner state."""
        state = super(MarathonSpawner, self).get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        # container_name is derived from the user, so nothing needs restoring.
        if 'container_name' in state:
            pass

    def get_health_checks(self):
        """Build the TCP health check Marathon uses to gate task health."""
        health_checks = []
        health_checks.append(MarathonHealthCheck(
            protocol='TCP',
            port_index=0,
            grace_period_seconds=300,
            interval_seconds=60,
            timeout_seconds=20,
            max_consecutive_failures=0
        ))
        return health_checks

    def get_volumes(self):
        """Convert configured volume dicts to Marathon objects, expanding
        {username} templates and dropping duplicate container paths."""
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(mv.external['name'], self)
            volumes.append(mv)
        out_vols = []
        dups = {}
        # Remove Duplicates there should be only one container path point for container
        for x in volumes:
            if x.container_path in dups:
                pass
            else:
                out_vols.append(x)
                dups[x.container_path] = 1
        return out_vols

    def get_app_cmd(self):
        """Expand the {username}/{userwebport}/{usersshport} templates in the
        configured app command."""
        retval = self.app_cmd.replace("{username}", self.user.name)
        retval = retval.replace("{userwebport}", str(self.user_web_port))
        if self.use_jupyterlab == 1:
            print("This is where I should do some thing if I want to run Jupyter lab")
        if self.user_ssh_hagroup != "":
            # With an HA group, Marathon assigns the ssh port ($PORT0).
            retval = retval.replace("{usersshport}", "$PORT0")
        else:
            retval = retval.replace("{usersshport}", str(self.user_ssh_port))
        return retval

    def get_port_mappings(self):
        """Map each exposed port to a dynamic host port (BRIDGE mode only)."""
        port_mappings = []
        if self.network_mode == "BRIDGE":
            for p in self.ports:
                port_mappings.append(
                    MarathonContainerPortMapping(
                        container_port=p,
                        host_port=0,
                        protocol='tcp'
                    )
                )
        return port_mappings

    def get_constraints(self):
        """Parse configured placement constraints into Marathon objects."""
        constraints = []
        for c in self.marathon_constraints:
            constraints.append(MarathonConstraint.from_json(c))
        # BUGFIX: this return was missing, so constraints were always None.
        return constraints

    @run_on_executor
    def get_deployment(self, deployment_id):
        """Look up a Marathon deployment by id; None if finished/unknown."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        """Return the in-flight deployment touching app_name, if any."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        """Resolve (ip, port) of the app's single task."""
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        port = app_info.tasks[0].ports[0]
        return (ip, port)

    @run_on_executor
    def get_app_info(self, app_name):
        """Fetch the app (with tasks) from Marathon; None if not created yet."""
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet", app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        # Rebuild the hub API URL with the publicly reachable ip/port overrides.
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((
            uri.scheme,
            '%s:%s' % (ip, port),
            uri.path,
            uri.params,
            uri.query,
            uri.fragment
        ))

    def get_env(self):
        """Build the container environment: hub wiring plus custom_env extras."""
        env = super(MarathonSpawner, self).get_env()
        env.update(dict(
            # Jupyter Hub config
            JPY_USER=self.user.name,
            # JPY_COOKIE_NAME=self.user.server.cookie_name,
            # JPY_BASE_URL=self.user.server.base_url,
            JPY_HUB_PREFIX=self.hub.server.base_url,
            JPY_USER_WEB_PORT=str(self.user_web_port),
            JPY_USER_SSH_PORT=str(self.user_ssh_port),
            JPY_USER_SSH_HOST=str(self.user_ssh_host)
        ))

        if self.notebook_dir:
            env['NOTEBOOK_DIR'] = self.notebook_dir

        if self.hub_ip_connect or self.hub_port_connect > 0:
            hub_api_url = self._public_hub_api_url()
        else:
            hub_api_url = self.hub.api_url
        env['JPY_HUB_API_URL'] = hub_api_url

        for x in self.custom_env:
            for k, v in x.items():
                env[k] = str(v)
        return env

    def update_users(self):
        """Load this user's per-user overrides from zeta_user_file.

        The file is JSON-lines: one object per line, '#' lines are comments.
        Failure behavior is governed by no_user_file_fail.
        """
        # No changes if the zeta_user_file is blank
        if self.zeta_user_file != "":
            try:
                j = open(self.zeta_user_file, "r")
                user_file = j.read()
                j.close()
                user_ar = {}
                for x in user_file.split("\n"):
                    if x.strip().find("#") != 0 and x.strip() != "":
                        y = json.loads(x)
                        if y['user'] == self.user.name:
                            user_ar = y
                            break
                if len(user_ar) == 0:
                    self.log.error("Could not find current user %s in zeta_user_file %s - Not Spawning" % (self.user.name, self.zeta_user_file))
                    if self.no_user_file_fail == True:
                        # NOTE(review): this raise is caught by the broad except
                        # below and re-raised with a misleading "could not open
                        # file" message — kept for behavioral compatibility.
                        raise Exception('no_user_file_fail is True, will not go on')
                print("User List identified and loaded, setting values to %s" % user_ar)
                self.cpu_limit = user_ar['cpu_limit']
                self.mem_limit = user_ar['mem_limit']
                self.user_ssh_port = user_ar['user_ssh_port']
                self.user_web_port = user_ar['user_web_port']
                self.user_ssh_host = user_ar['user_ssh_host']
                try:
                    self.user_ssh_hagroup = user_ar['user_ssh_hagroup']
                except:
                    self.user_ssh_hagroup = ""
                try:
                    self.use_jupyterlab = int(user_ar['use_jupyterlab'])
                except:
                    self.use_jupyterlab = 0
                self.network_mode = user_ar['network_mode']
                self.app_image = user_ar['app_image']
                self.marathon_constraints = user_ar['marathon_constraints']
                self.ports.append(self.user_web_port)
                self.ports.append(self.user_ssh_port)
                self.custom_env = self.custom_env + user_ar['custom_env']
                self.volumes = self.volumes + user_ar['volumes']
                print("User List Loaded!")
                # Example record:
                # { "user": "name", "cpu_limit": "1", "mem_limit": "2G", "user_ssh_port": 10500, "user_web_port": 10400, "network_mode": "BRIDGE", "app_image": "$APP_IMG", "marathon_constraints": []}
            except:
                self.log.error("Could not find or open zeta_user_file: %s" % self.zeta_user_file)
                if self.no_user_file_fail == True:
                    raise Exception("Could not open file and config says don't go on")

    @gen.coroutine
    def start(self):
        """Create the Marathon app and wait until its single task is healthy.

        Returns (ip, port) of the running notebook server, or None on failure.
        """
        # First make a quick call to determine if user info was updated
        self.update_users()
        # Go on to start the notebook
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())

        app_container = MarathonContainer(
            docker=docker_container,
            type='DOCKER',
            volumes=self.get_volumes())

        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0

        if self.user_ssh_hagroup != "":
            # Expose the ssh port through HAProxy via the configured group.
            myports = [self.user_ssh_port]
            labels = {"HAPROXY_GROUP": self.user_ssh_hagroup, "HA_EDGE_CONF": "1"}
        else:
            labels = {}
            myports = []

        app_request = MarathonApp(
            id=self.container_name,
            cmd=self.get_app_cmd(),
            env=self.get_env(),
            cpus=self.cpu_limit,
            mem=mem_request,
            container=app_container,
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            instances=1,
            labels=labels,
            ports=myports,
            fetch=self.fetch,
        )

        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s", self.container_name)
            return None

        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Delete the Marathon app; unless now=True, wait for the teardown
        deployment to disappear."""
        try:
            status = self.marathon.delete_app(self.container_name)
        except:
            self.log.error("Could not delete application %s", self.container_name)
            raise
        else:
            if not now:
                while True:
                    deployment = yield self.get_deployment(status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Spawner poll: None while running (or deploying without a stop),
        1 when shutting down, 0 when not healthy."""
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down", self.container_name)
                    return 1
            return None
        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
class MarathonSpawner(Spawner):
    """JupyterHub spawner that runs each user's notebook server as a Marathon
    app using the DOCKER containerizer (BRIDGE networking by default)."""

    # Docker image for the single-user server; tag tracks the JupyterHub X.Y version.
    app_image = Unicode("jupyterhub/singleuser:%s" % _jupyterhub_xy, config=True)

    app_prefix = Unicode("jupyter",
                         help=dedent("""
            Prefix for app names. The full app name for a particular user will be
            <prefix>/<username>.
            """)).tag(config=True)

    marathon_host = Unicode(
        u'', help="Hostname of Marathon server").tag(config=True)

    marathon_constraints = List(
        [], help='Constraints to be passed through to Marathon').tag(config=True)

    ports = List([8888], help='Ports to expose externally').tag(config=True)

    volumes = List([],
                   help=dedent("""
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]
            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive it will be
            replaced with the current user's name.
            """)).tag(config=True)

    network_mode = Unicode('BRIDGE',
                           help="Enum of BRIDGE or HOST").tag(config=True)

    hub_ip_connect = Unicode(
        "", help="Public IP address of the hub").tag(config=True)

    @observe('hub_ip_connect')
    def _ip_connect_changed(self, change):
        # Deprecated knob: JupyterHub >= 0.8 provides hub_connect_ip natively.
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_ip_connect is no longer needed with JupyterHub 0.8."
                " Use JupyterHub.hub_connect_ip instead.",
                DeprecationWarning,
            )

    hub_port_connect = Integer(-1,
                               help="Public PORT of the hub").tag(config=True)

    @observe('hub_port_connect')
    def _port_connect_changed(self, change):
        # Deprecated knob: JupyterHub >= 0.8 provides hub_connect_port natively.
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_port_connect is no longer needed with JupyterHub 0.8."
                " Use JupyterHub.hub_connect_port instead.",
                DeprecationWarning,
            )

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # fix default port to 8888, used in the container
    @default('port')
    def _port_default(self):
        return 8888

    # default to listening on all-interfaces in the container
    @default('ip')
    def _ip_default(self):
        return '0.0.0.0'

    # Single background thread shared by all instances for blocking Marathon calls.
    _executor = None

    @property
    def executor(self):
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host)

    @property
    def container_name(self):
        # Marathon app id for this user's server.
        return '/%s/%s' % (self.app_prefix, self.user.name)

    def get_state(self):
        """Persist the container name alongside the base Spawner state."""
        state = super(MarathonSpawner, self).get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        # container_name is derived from the user, so nothing needs restoring.
        if 'container_name' in state:
            pass

    def get_health_checks(self):
        """Build the TCP health check Marathon uses to gate task health."""
        health_checks = []
        health_checks.append(
            MarathonHealthCheck(protocol='TCP',
                                port_index=0,
                                grace_period_seconds=300,
                                interval_seconds=30,
                                timeout_seconds=20,
                                max_consecutive_failures=0))
        return health_checks

    def get_volumes(self):
        """Convert configured volume dicts to Marathon objects, expanding
        {username} templates in paths and external names."""
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(
                mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(
                    mv.external['name'], self)
            volumes.append(mv)
        return volumes

    def get_port_mappings(self):
        # Each exposed port maps to a dynamically assigned host port (host_port=0).
        port_mappings = []
        for p in self.ports:
            port_mappings.append(
                MarathonContainerPortMapping(container_port=p,
                                             host_port=0,
                                             protocol='tcp'))
        return port_mappings

    def get_constraints(self):
        """Parse configured placement constraints into Marathon objects."""
        constraints = []
        for c in self.marathon_constraints:
            constraints.append(MarathonConstraint.from_json(c))
        return constraints

    @run_on_executor
    def get_deployment(self, deployment_id):
        """Look up a Marathon deployment by id; None if finished/unknown."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        """Return the in-flight deployment touching app_name, if any."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        # Single-instance app assumed; resolve the task host to an IP.
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        return (ip, app_info.tasks[0].ports[0])

    @run_on_executor
    def get_app_info(self, app_name):
        """Fetch the app (with tasks) from Marathon; None if not created yet."""
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet",
                          app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        # Rebuild the hub API URL with the publicly reachable ip/port overrides.
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((uri.scheme, '%s:%s' % (ip, port), uri.path,
                           uri.params, uri.query, uri.fragment))

    def get_args(self):
        """Extend the base args, rewriting --hub-api-url for pre-0.8 hubs."""
        args = super().get_args()
        if self.hub_ip_connect:
            # JupyterHub 0.7 specifies --hub-api-url
            # on the command-line, which is hard to update
            for idx, arg in enumerate(list(args)):
                if arg.startswith('--hub-api-url='):
                    args.pop(idx)
                    break
            args.append('--hub-api-url=%s' % self._public_hub_api_url())
        return args

    @gen.coroutine
    def start(self):
        """Create the Marathon app and wait until its single task is healthy.

        Returns (ip, port) of the running notebook server, or None on failure.
        """
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())
        app_container = MarathonContainer(docker=docker_container,
                                          type='DOCKER',
                                          volumes=self.get_volumes())
        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0
        cmd = self.cmd + self.get_args()
        app_request = MarathonApp(id=self.container_name,
                                  cmd=' '.join(cmd),
                                  env=self.get_env(),
                                  cpus=self.cpu_limit,
                                  mem=mem_request,
                                  container=app_container,
                                  constraints=self.get_constraints(),
                                  health_checks=self.get_health_checks(),
                                  instances=1,
                                  accepted_resource_roles=['*'])
        self.log.info("Creating App: %s", app_request)
        self.log.info("self.marathon: %s", self.marathon)
        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s",
                           self.container_name)
            self.log.error("app: %s", app)
            return None
        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Delete the Marathon app; unless now=True, wait for the teardown
        deployment to disappear."""
        try:
            status = self.marathon.delete_app(self.container_name)
        except:
            self.log.error("Could not delete application %s",
                           self.container_name)
            raise
        else:
            if not now:
                while True:
                    deployment = yield self.get_deployment(
                        status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Spawner poll: None while running (or deploying without a stop),
        1 when shutting down, 0 when not healthy."""
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down",
                                   self.container_name)
                    return 1
            return None
        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
def get_instances_amount(client: MarathonClient, appid: str) -> int: try: return client.get_app(appid).instances except MarathonHttpError: return -1
class MarathonIF(object):
    """Thin retry-wrapper around marathon-python's MarathonClient.

    Most operations retry for a bounded time because Marathon/Mesos/ZooKeeper
    can be transiently unavailable; on timeout they raise Exception.
    """

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the list of all Marathon apps."""
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id, timeout=300):
        """Return the app, None if it does not exist; retry errors up to *timeout* s."""
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                try:
                    a = self.mcli.get_app(app_id)
                except marathon.exceptions.NotFoundError as e:  # NOQA
                    # A missing app is a valid answer, not an error to retry.
                    return None
                return a
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # are not swallowed by the retry loop.
                l.info("mcli: get_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli get_app timed out, possible zookeper/marathon/mesos malfunction")

    def delete_app(self, app_id, force=False, timeout=200):
        """Delete the app; retry transient errors up to *timeout* seconds."""
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.delete_app(app_id, force)
                return
            except Exception:
                l.info("mcli: delete_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli delete_app timed out, possible zookeper/marathon/mesos malfunction")

    def delete_deployment(self, dep_id):
        """Cancel a running deployment by id."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the list of running deployments."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete *app_id* when present; retry whole lookup+delete up to *trys* times."""
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except Exception:
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        raise

    @staticmethod
    def is_valid_app_id(app_id):
        """True if *app_id* contains only letters, digits, hyphens, slashes, dots."""
        # allowed: lowercase letters, digits, hyphens, slash, dot
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """ Create and start an app.
        :param app_id: (str) - Application ID
        :param attr: marathon.models.app.MarathonApp application to create.
        :return: the created app
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            # Message aligned with the regex above (it also permits
            # uppercase letters, slashes and dots).
            l.error("Error: Only letters, digits, hyphens, slashes and dots are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    # Another deployment holds the app; back off briefly and retry.
                    time.sleep(1)
                else:
                    raise
        # Was a bare `raise` with no active exception (RuntimeError);
        # raise something meaningful instead.
        raise Exception("mcli create_app gave up: app %s stayed locked by deployments" % app_id)

    def wait_app_removal(self, app):
        """Block until *app* no longer exists; poll every 0.2 s."""
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
            # Log roughly every 5 s instead of every poll (was: every iteration).
            if cnt % 25 == 0:
                l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count, sleep_before_next_try=1):
        """Block until *app* has exactly *running_count* running tasks; return the app."""
        cnt = 0
        while True:
            a1 = self.get_app(app)
            # if tasks_running are greater (due to whatever reason, scale down accordingly)
            if a1.tasks_running > running_count:
                delta = a1.tasks_running - running_count
                l.info("Found [%d] more apps, scaling down to [%d]", delta, running_count)
                self.scale_app(app, running_count)
                # Allow for some time before next poll
                time.sleep(1)
                continue
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(sleep_before_next_try)
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale, timeout=300):
        """Scale *app* to *scale* instances; retry transient errors up to *timeout* s."""
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.scale_app(app, scale)
                return
            except Exception:
                l.info("mcli: scale_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli scale_app timed out, possible zookeper/marathon/mesos malfunction")

    def ping(self):
        """Ping the Marathon server."""
        return self.mcli.ping()

    def kill_task(self, app_id, task_id):
        """Kill a single task of an app."""
        return self.mcli.kill_task(app_id, task_id)
def deploy(app_definition, marathon_url, instances, auth_token, zero, force):
    """Deploy a Marathon app definition, replacing older versions of the service.

    :param app_definition: path to a JSON Marathon app definition (instances
        should be 0 in the file; scaling happens here)
    :param marathon_url: Marathon base URL
    :param instances: target instance count, or None to inherit from the
        currently running version (2 for a first-ever deployment)
    :param auth_token: Marathon auth token
    :param zero: 'Yes' for zero-downtime hybrid rollout, anything else for
        stop-then-start with downtime
    :param force: 'Yes' to redeploy over an existing identical app id
    """
    old_appids = []

    # Connect to Marathon
    print("\nConnecting to Marathon...")
    c = MarathonClient(marathon_url, auth_token=auth_token)
    print("Connected to", marathon_url)

    # Pick up the Marathon App Definition file (was an unclosed open().read()).
    with open(app_definition) as f:
        app_json = f.read()
    app = MarathonApp.from_json(json.loads(app_json))
    new_app_id = app.id
    # Naming convention (id): /some/group/service_name.uniquevalue
    service_name = new_app_id.split("/")[-1].split(".")[0]

    # Instantiate the new application on DC/OS but don't launch it yet.
    # If forced, the application will be relaunched even if the ID already exists.
    print("\nInstantiating new application on Marathon with", app.instances,
          "instances...")
    try:
        c.create_app(new_app_id, app)
    except Exception:
        # Narrowed from a bare `except:`; creation typically fails because
        # the id already exists.
        if force == 'Yes':
            print("\nForcing redeploy of the same app id...", new_app_id)
            c.update_app(new_app_id, app, force=True, minimal=True)
            check_deployment(c, new_app_id)
        else:
            sys.exit()
    print("Created app", new_app_id)

    # Find currently running apps of the same service.
    print("\nFinding any existing apps for service:", service_name)
    for existing in c.list_apps():  # renamed: no longer shadows `app` above
        existing_service_name = existing.id.split("/")[-1].split(".")[0]
        if service_name == existing_service_name and existing.instances > 0:
            print("Found up and running application id:", existing.id)
            old_appids.append(existing.id)

    # First deployment ever: just launch the desired number of instances.
    # Otherwise perform a hybrid release, then clean up older app instances.
    if not old_appids:
        if instances is None:
            instances = 2
        print("No current apps found. Launching brand new service with",
              instances, "instances...")
        c.scale_app(new_app_id, instances=instances)
        check_deployment(c, new_app_id)
        check_health(c, new_app_id)
    else:
        old_appids.reverse()
        if zero == 'Yes':
            print("\nStarting zero downtime deployment for...", new_app_id)
            for old_appid in old_appids:
                if instances is None:
                    instances = c.get_app(old_appid).instances
                if (old_appid == '' or old_appid == new_app_id
                        or old_appid == '/' + new_app_id):
                    # Same id as the new app: just scale it up in place.
                    print("Scaling existing app_id", new_app_id, "to",
                          instances, "instances...")
                    c.scale_app(new_app_id, instances=instances)
                    check_deployment(c, new_app_id)
                    check_health(c, new_app_id)
                else:
                    print("Target number of total instances:", instances)
                    # Shift ~50% of capacity first, then finish the swap.
                    delta = int(round(instances * .50))
                    delta = delta if delta > 0 else 1
                    scale(c, new_app_id, old_appid, delta)
                    if c.get_app(new_app_id).instances != instances:
                        print("\nLaunch", instances - delta,
                              "remaining instance(s) of the new version...")
                        c.scale_app(new_app_id, instances=instances)
                        check_deployment(c, new_app_id)
                        check_health(c, new_app_id)
                    if c.get_app(old_appid).instances > 0:
                        print(
                            "Finish shutting down remaining instances of the old version..."
                        )
                        c.scale_app(old_appid, instances=0)
                        check_deployment(c, old_appid)
        else:
            print("Started deployment with downtime...")
            if instances is None:
                # BUG FIX: this path previously passed instances=None to
                # scale_app; inherit the count from an existing version.
                instances = c.get_app(old_appids[0]).instances
            for old_appid in old_appids:
                c.scale_app(old_appid, instances=0)
                check_deployment(c, old_appid)
            c.scale_app(new_app_id, instances=instances)
            check_deployment(c, new_app_id)
            check_health(c, new_app_id)

    print("\nSUCCESS:\nNew application ID:", new_app_id,
          "\nRunning instances:", instances)
def get_app_port(app_id, client=None):
    """Return the first service port of the Marathon app *app_id*.

    Fixes two defects in the original: it called MarathonClient.get_app as
    an unbound method (no client instance), and it indexed the result like
    the raw REST payload (['apps'][0]['ports'][0]) although the client
    returns a MarathonApp object.

    :param app_id: Marathon application id
    :param client: optional MarathonClient; when None, one is built from the
        module-level marathon_host/marathon_port (as the rest of this file does)
    :return: the app's first port
    """
    if client is None:
        client = MarathonClient(
            'http://' + str(marathon_host) + ':' + str(marathon_port))
    app = client.get_app(app_id)
    return app.ports[0]