def ip_address(self):
    """Return the service's IP address on the configured overlay network.

    The address is read from Swarm on every call, because it changes outside
    our control. When the service's Docker status is not the "started"
    status, no lookup is performed and an empty dict is returned.
    """
    if self.docker_status != self.DOCKER_START_STATUS:
        address = {}
    else:
        container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
        per_network = container_info['ip_address']
        # The inspect result is keyed by network name; pick the overlay network.
        address = per_network[get_conf().overlay_network_name]
    return address
def loop(self):
    """Serve API requests received on the ZeroMQ socket, forever.

    Each iteration receives one JSON message from ``self.zmq_s``, dispatches
    on its ``'command'`` key and answers through ``self._reply_ok`` or
    ``self._reply_error``. After the reply, the call is reported to the
    ``'metric'`` singleton together with the time the message was received.

    Handled commands: ``execution_start``, ``execution_terminate``,
    ``execution_delete`` and ``service_inspect``. Any other command is logged
    and answered with an error.

    ``self.debug_has_replied`` is reset before each command and checked
    afterwards; it is presumably set by the reply helpers (not visible here).

    :raises ZoeException: if a command was processed without a reply being
        recorded in ``self.debug_has_replied``
    """
    assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']
        sql_manager = config.singletons['sql_manager']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                # Reply before handing the execution over to the execution manager.
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            # Deleting an unknown execution is not an error: reply OK either way.
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            self._reply_ok()
        elif command == 'service_inspect':
            service_id = message['service_id']
            service = sql_manager.service_list(id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            # Still send a reply so the peer gets an answer, then fail loudly.
            self._reply_error('bug')
            # The command name is now interpolated; the original message left
            # the '{}' placeholder unfilled.
            raise ZoeException('BUG: command {} does not fill a reply'.format(command))
        config.singletons['metric'].metric_api_call(start_time, command)
def loop(self):
    """Serve API requests received on the ZeroMQ socket, forever.

    Each iteration receives one JSON message from ``self.zmq_s``, dispatches
    on its ``'command'`` key and answers through ``self._reply_ok`` or
    ``self._reply_error``. After the reply, the call is reported to the
    ``'metric'`` singleton together with the time the message was received.

    Handled commands: ``execution_start``, ``execution_terminate``,
    ``execution_delete`` and ``service_inspect``. Any other command is logged
    and answered with an error.

    ``self.debug_has_replied`` is reset before each command and checked
    afterwards; it is presumably set by the reply helpers (not visible here).

    :raises ZoeException: if a command was processed without a reply being
        recorded in ``self.debug_has_replied``
    """
    assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']
        sql_manager = config.singletons['sql_manager']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                # Reply before handing the execution over to the execution manager.
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = sql_manager.execution_list(id=exec_id, only_one=True)
            # Deleting an unknown execution is not an error: reply OK either way.
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            self._reply_ok()
        elif command == 'service_inspect':
            service_id = message['service_id']
            service = sql_manager.service_list(id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            # Still send a reply so the peer gets an answer, then fail loudly.
            self._reply_error('bug')
            # The command name is now interpolated; the original message left
            # the '{}' placeholder unfilled.
            raise ZoeException('BUG: command {} does not fill a reply'.format(command))
        config.singletons['metric'].metric_api_call(start_time, command)
class PlatformManager:
    """Translate Zoe executions, users and gateways into Swarm containers and networks.

    ``state_manager`` is initialised to ``None`` and has to be assigned by the
    owner of this object before any method that touches the state is called.

    :type swarm: SwarmClient
    :type scheduler: ZoeScheduler
    :type state_manager: StateManager
    """

    # ID of the additional Docker network every spawned container and gateway
    # is connected to.
    # NOTE(review): hard-coded, deployment-specific value - presumably belongs
    # in the configuration; confirm before reusing on another cluster.
    _EXTERNAL_NETWORK_ID = 'eeef9754c16790a29d5210c5d9ad8e66614ee8a6229b6dc6f779019d46cec792'

    def __init__(self, sched_policy_class):
        """Create the Swarm client and the scheduler using *sched_policy_class*."""
        self.swarm = SwarmClient(get_conf())
        self.scheduler = ZoeScheduler(self, sched_policy_class)
        self.state_manager = None

    def execution_submitted(self, execution: execution_module.Execution):
        """Mark *execution* as scheduled and pass it to the scheduler."""
        execution.set_scheduled()
        self.scheduler.incoming(execution)

    def execution_start(self, execution: execution_module.Execution) -> bool:
        """Spawn all containers of *execution* and mark it as started.

        If spawning fails with a ``ZoeException`` the execution is terminated
        with reason ``'error'`` and the exception is re-raised.

        :return: True once the execution has been started
        :raises ZoeException: if any container could not be created
        """
        try:
            self._application_to_containers(execution)
        except ZoeException:
            self.execution_terminate(execution, reason='error')
            raise
        execution.set_started()
        self.state_manager.state_updated()
        # Honour the declared ``bool`` return type, as _spawn_process does.
        return True

    def _application_to_containers(self, execution: execution_module.Execution):
        """Spawn one container for each process of the execution's application."""
        for process in execution.application.processes:
            self._spawn_process(execution, process)

    @staticmethod
    def _expand_template(template: str, values: dict) -> str:
        """Substitute *values* into *template*, raising ZoeException on an unknown variable."""
        try:
            return template.format(**values)
        except KeyError as err:
            raise ZoeException("cannot find variable to substitute in expression {}".format(template)) from err

    def _spawn_process(self, execution: execution_module.Execution, process_description: application_module.Process) -> bool:
        """Create the Swarm container for one process of *execution*.

        The container is registered in the state and appended to
        ``execution.containers``.

        :return: True when the container has been created and registered
        :raises ZoeException: if a template variable is unknown or Swarm
            returns no information for the new container
        """
        prefix = get_conf().container_name_prefix
        copts = ContainerOptions()
        copts.name = prefix + '-' + process_description.name + "-{}".format(execution.id)
        copts.set_memory_limit(process_description.required_resources['memory'])
        copts.network_name = '{}-usernet-{}'.format(prefix, execution.owner.id)
        container_id = self.state_manager.gen_id()
        copts.labels = {
            'zoe.{}'.format(prefix): '',
            'zoe.execution.id': str(execution.id),
            'zoe.execution.name': execution.name,
            'zoe.container.id': str(container_id),
            'zoe.container.name': process_description.name,
            'zoe.owner': execution.owner.name,
            'zoe.prefix': prefix,
            'zoe.type': 'app_process'
        }
        if process_description.monitor:
            copts.labels['zoe.monitor'] = ''
        else:
            copts.labels['zoe.normal'] = ''
        copts.restart = not process_description.monitor  # Monitor containers should not restart

        # Values available to the templates in the environment variables and
        # in the command (computed before the new container is spawned).
        subst_dict = {
            "execution_id": str(execution.id),
            "user_id": str(execution.owner.id),
            'user_name': execution.owner.name,
            'name_prefix': prefix
        }
        for env_name, env_value in process_description.environment:
            copts.add_env_variable(env_name, self._expand_template(env_value, subst_dict))

        # The same dictionary is used for templates in the command. A bad
        # template raises ZoeException here too, so that execution_start can
        # clean up the containers spawned so far.
        if process_description.command is not None:
            copts.set_command(self._expand_template(process_description.command, subst_dict))

        cont_info = self.swarm.spawn_container(process_description.docker_image, copts)
        if cont_info is None:
            # Same failure handling as start_gateway_container.
            raise ZoeException('Cannot create container {}'.format(copts.name))

        container = container_module.Container(self.state_manager)
        container.docker_id = cont_info["docker_id"]
        container.ip_address = cont_info["ip_address"]
        container.name = copts.name
        container.is_monitor = process_description.monitor
        container.ports = [p.to_dict() for p in process_description.ports]
        container.id = container_id
        execution.containers.append(container)
        container.execution = execution

        self.swarm.connect_to_network(container.docker_id, self._EXTERNAL_NETWORK_ID)

        self.state_manager.new('container', container)
        return True

    def execution_terminate(self, execution: execution_module.Execution, reason):
        """Terminate all containers of an execution and record its final status.

        Container logs are collected before each container is terminated and
        are stored on the execution only when it had at least one container.

        :param execution: The execution to be terminated
        :param reason: termination reason: ``'error'``, ``'finished'``, or
            anything else for a plain termination
        :return: None
        """
        logs = []
        if execution.containers:
            # Iterate over a copy of the container list.
            # NOTE(review): presumably deleting a container from the state
            # also removes it from execution.containers - confirm.
            for cont in execution.containers.copy():
                assert isinstance(cont, container_module.Container)
                container_log = self.log_get(cont.id)
                if container_log is not None:
                    logs.append((cont.name, container_log))
                self.swarm.terminate_container(cont.docker_id, delete=True)
                self.state_manager.delete('container', cont.id)
                log.info('Container {} terminated'.format(cont.name))
            execution.store_logs(logs)

        if reason == 'error':
            execution.set_error()
        elif reason == 'finished':
            execution.set_finished()
        else:
            execution.set_terminated()

        self.scheduler.execution_terminate(execution)

    def start_gateway_container(self, user):
        """Spawn the gateway container of *user* and record its ID and URLs on the user.

        :raises ZoeException: if Swarm returns no information for the new container
        """
        prefix = get_conf().container_name_prefix
        copts = ContainerOptions()
        copts.name = '{}-gateway-{}'.format(prefix, user.id)
        copts.network_name = '{}-usernet-{}'.format(prefix, user.id)
        copts.ports.append(1080)
        copts.labels = {
            'zoe.{}.gateway'.format(prefix): '',
            'zoe.owner': user.name,
            'zoe.prefix': prefix,
            'zoe.type': 'gateway'
        }
        copts.restart = True
        # Both roles currently use the same image, see the TODO below.
        if user.role == 'guest':
            image = get_conf().private_registry + '/zoerepo/guest-gateway'
        else:
            image = get_conf().private_registry + '/zoerepo/guest-gateway'  # TODO: create an image with ssh
        cont_info = self.swarm.spawn_container(image, copts)
        if cont_info is None:
            raise ZoeException('Cannot create user gateway container')
        user.gateway_docker_id = cont_info['docker_id']
        user.set_gateway_urls(cont_info)
        self.swarm.connect_to_network(user.gateway_docker_id, self._EXTERNAL_NETWORK_ID)

    def kill_gateway_container(self, user):
        """Terminate and delete the user's gateway container and clear its references on the user."""
        self.swarm.terminate_container(user.gateway_docker_id, delete=True)
        user.gateway_docker_id = None
        user.gateway_urls = []

    def create_user_network(self, user):
        """Create the per-user network in Swarm and store its ID in ``user.network_id``."""
        log.info('Creating a new network for user {}'.format(user.id))
        net_name = '{}-usernet-{}'.format(get_conf().container_name_prefix, user.id)
        user.network_id = self.swarm.network_create(net_name)

    def remove_user_network(self, user):
        """Remove the network referenced by ``user.network_id`` from Swarm."""
        log.info('Removing network for user {}'.format(user.name))
        self.swarm.network_remove(user.network_id)

    def log_get(self, container_id: int) -> str:
        """Return the Swarm log of a container, or ``''`` if the container is not in the state."""
        container = self.state_manager.get_one('container', id=container_id)
        if container is None:
            return ''
        return self.swarm.log_get(container.docker_id)

    def container_stats(self, container_id: int) -> ContainerStats:
        """Return the Swarm statistics of the container with the given state ID.

        No check is made for a container missing from the state.
        """
        container = self.state_manager.get_one('container', id=container_id)
        return self.swarm.stats(container.docker_id)

    def is_container_alive(self, container: container_module.Container) -> bool:
        """Return the ``"running"`` flag reported by Swarm, or False if Swarm has no information."""
        ret = self.swarm.inspect_container(container.docker_id)
        if ret is None:
            return False
        return ret["running"]

    def swarm_stats(self) -> SwarmStats:
        """Return the cluster information reported by the Swarm client."""
        # TODO implement some caching
        return self.swarm.info()

    def scheduler_stats(self) -> SchedulerStats:
        """Return the statistics of the scheduler policy in use."""
        return self.scheduler.scheduler_policy.stats()

    def check_state_swarm_consistency(self):
        """Compare the Zoe state with what exists in Swarm and repair the differences.

        Steps, in order:

        1. find users whose network or gateway is missing or unknown to Swarm;
        2. match Swarm networks and gateways back to users by the numeric user
           ID at the end of their name, queueing orphans and duplicate
           networks for deletion;
        3. delete the queued gateways and networks, then create what is still
           missing for the remaining users;
        4. drop from the state the containers that no longer exist in Swarm.

        ``state_updated()`` is called on the state manager if any user was
        relinked or had a network or gateway created.
        """
        state_changed = False
        prefix = get_conf().container_name_prefix
        usernet_prefix = '{}-usernet-'.format(prefix)
        gateway_prefix = '{}-gateway-'.format(prefix)

        users = self.state_manager.get('user')
        networks = self.swarm.network_list(usernet_prefix)
        gateways = self.swarm.list(['zoe.{}.gateway'.format(prefix)])

        # Built once: membership is tested for every user below.
        network_ids = {n['id'] for n in networks}
        gateway_ids = {g['id'] for g in gateways}

        users_no_network = []
        users_no_gateway = []
        networks_to_delete = []
        gateways_to_delete = []

        for u in users:
            if u.network_id is None:
                log.error('state inconsistency: user {} has no network'.format(u.name))
                users_no_network.append(u)
            elif u.network_id not in network_ids:
                log.error('state inconsistency: user {} has an invalid network'.format(u.name))
                u.network_id = None
                users_no_network.append(u)

            if u.gateway_docker_id is None:
                log.error('state inconsistency: user {} has no gateway'.format(u.name))
                users_no_gateway.append(u)
            elif u.gateway_docker_id not in gateway_ids:
                log.error('state inconsistency: user {} has an invalid gateway container ID'.format(u.name))
                u.gateway_docker_id = None
                users_no_gateway.append(u)

        duplicate_check = set()
        for n in networks:
            try:
                uid = int(n['name'][len(usernet_prefix):])
            except ValueError:
                log.error('network {} does not belong to Zoe, bug?'.format(n['name']))
                networks_to_delete.append(n['id'])
                continue
            if uid in duplicate_check:
                log.warning('state inconsistency: found two networks for the same user')
                networks_to_delete.append(n['id'])
                continue
            duplicate_check.add(uid)
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_network:
                # The network exists but the user lost track of it: relink.
                user.network_id = n['id']
                users_no_network.remove(user)
                log.error('fixed: user {} linked to network {}'.format(user.name, n['name']))
                state_changed = True
            elif user is None:
                log.error('state inconsistency: found a network for user {} who no longer exists'.format(uid))
                networks_to_delete.append(n['id'])

        for g in gateways:
            try:
                uid = int(g['name'][len(gateway_prefix):])
            except ValueError:
                log.error('container {} does not belong to Zoe, bug?'.format(g['name']))
                gateways_to_delete.append(g['id'])
                continue
            user = self.state_manager.get_one('user', id=uid)
            if user is not None and user in users_no_gateway:
                # The gateway exists but the user lost track of it: relink.
                user.gateway_docker_id = g['id']
                users_no_gateway.remove(user)
                cont_info = self.swarm.inspect_container(g['id'])
                user.set_gateway_urls(cont_info)
                log.error('fixed: user {} linked to gateway {}'.format(user.name, g['name']))
                state_changed = True
            elif user is None:
                log.error('state inconsistency: found a gateway for user {} who no longer exists'.format(uid))
                gateways_to_delete.append(g['id'])

        # Fix all inconsistencies found
        for g in gateways_to_delete:
            log.error('fixed: terminating orphan gateway container {}'.format(g[:8]))
            self.swarm.terminate_container(g, delete=True)
        for n in networks_to_delete:
            log.error('fixed: terminating orphan network {}'.format(n[:8]))
            self.swarm.network_remove(n)
        for u in users_no_network:
            log.error('fixed: creating network for user {}'.format(u.name))
            self.create_user_network(u)
        for u in users_no_gateway:
            log.error('fixed: creating gateway for user {}'.format(u.name))
            self.start_gateway_container(u)

        # ### Check executions and container consistency
        swarm_containers = self.swarm.list(only_label='zoe.{}'.format(prefix))
        swarm_container_ids = {x['id'] for x in swarm_containers}
        conts_state_to_delete = []
        for c_id, c in self.state_manager.containers.items():
            if c.docker_id not in swarm_container_ids:
                log.error('fixed: removing from state container {} that does not exist in Swarm'.format(c.name))
                conts_state_to_delete.append(c_id)
        # Deleted in a second pass, after iterating over the state's container mapping.
        for c_id in conts_state_to_delete:
            self.state_manager.delete('container', c_id)

        # The two user lists are not emptied by the fix-up loops above, so a
        # non-empty list means a network or gateway was just created.
        if state_changed or users_no_gateway or users_no_network:
            self.state_manager.state_updated()