def preload_image(self, image_name):
    """Pull an image from a Docker registry into each configured host.

    The image name must carry an explicit version (a ``:tag`` or
    ``@digest`` suffix). The pull is attempted on every entry of
    ``self.docker_config`` in the order they are stored; a failure on one
    host is logged and the remaining hosts are still tried.

    :param image_name: image reference, e.g. ``registry/repo/name:tag``
    :raises ZoeException: if the name cannot be parsed, if it carries no
        version, or if the pull failed on every host
    """
    parsed_name = re.search(
        r'^(?:([^/]+)/)?(?:([^/]+)/)?([^@:/]+)(?:[@:](.+))?$', image_name)
    # re.search() returns None when the name does not fit the pattern (for
    # example more than two '/'-separated prefixes). Report that through the
    # same exception type as the other validation failure instead of
    # crashing with AttributeError on ``None.group``.
    if parsed_name is None:
        raise ZoeException('Image name {} is not valid'.format(image_name))
    if parsed_name.group(4) is None:
        raise ZoeException(
            'Image {} does not have a version tag'.format(image_name))

    one_success = False
    for host_conf in self.docker_config:
        log.debug('Pre-loading image {} on host {}'.format(
            image_name, host_conf.name))
        time_start = time.time()
        my_engine = DockerClient(host_conf)
        try:
            my_engine.pull_image(image_name)
        except ZoeException:
            # Best effort: keep going so the image lands on as many hosts
            # as possible.
            log.error('Image {} pre-loading failed on host {}'.format(
                image_name, host_conf.name))
            continue
        one_success = True
        log.debug('Image {} pre-loaded on host {} in {:.2f}s'.format(
            image_name, host_conf.name, time.time() - time_start))

    if not one_success:
        raise ZoeException('Cannot pull image {}'.format(image_name))
def list(self, only_label=None) -> List[dict]:
    """
    List running or defined containers.

    :param only_label: optional mapping of label name to required value;
        a container is kept only when it carries every one of these labels
        with exactly the given value
    :return: a list of container summaries, one per matching container
    :raises ZoeException: if the Docker API call fails
    """
    try:
        containers = self.cli.containers.list(all=True)
    except docker.errors.APIError as ex:
        raise ZoeException(str(ex))
    except requests.exceptions.RequestException as ex:
        raise ZoeException(str(ex))

    wanted = {} if only_label is None else only_label

    def lacks_wanted_label(container):
        # True as soon as one requested label is absent or has another value.
        return any(
            key not in container.attrs['Config']['Labels']
            or container.attrs['Config']['Labels'][key] != value
            for key, value in wanted.items()
        )

    return [
        self._container_summary(container)
        for container in containers
        if not lacks_wanted_label(container)
    ]
def pull_image(self, image_name):
    """Pull an image into the Docker engine behind ``self.cli``.

    :param image_name: reference of the image to download
    :raises ZoeException: if the Docker API reports an error; the same
        message is also written to the error log
    """
    try:
        self.cli.images.pull(image_name)
    except docker.errors.APIError as e:
        failure = 'Cannot download image {}: {}'.format(image_name, e)
        log.error(failure)
        raise ZoeException(failure)
def loop(self):
    """The API loop.

    Receive JSON requests from ``self.zmq_s`` forever and dispatch on the
    ``command`` key. Supported commands are ``execution_start``,
    ``execution_terminate``, ``execution_delete`` and ``scheduler_stats``;
    anything else is answered with an error reply.

    ``self.debug_has_replied`` is cleared before each dispatch and checked
    afterwards (presumably the ``_reply_*`` helpers set it - confirm
    against their definitions).

    :raises ZoeException: if a command handler finished without sending
        a reply
    """
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = self.state.executions.select(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                # Reply before submitting so the client is not kept waiting.
                self._reply_ok()
                zoe_master.preprocessing.execution_submit(
                    self.state, self.scheduler, execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = self.state.executions.select(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                self._reply_ok()
                zoe_master.preprocessing.execution_terminate(
                    self.scheduler, execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = self.state.executions.select(id=exec_id, only_one=True)
            if execution is not None:
                zoe_master.preprocessing.execution_delete(execution)
            # Deleting an unknown execution is still answered with OK.
            self._reply_ok()
        elif command == 'scheduler_stats':
            try:
                data = self.scheduler.stats()
                if self.metrics.current_stats is None:
                    data['platform_stats'] = {}
                else:
                    data['platform_stats'] = \
                        self.metrics.current_stats.serialize()
            except ZoeException as e:
                log.error(str(e))
                self._reply_error(str(e))
            else:
                self._reply_ok(data=data)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The placeholder was previously left unformatted, so the
            # offending command never appeared in the exception text.
            raise ZoeException(
                'BUG: command {} does not fill a reply'.format(command))

        log.debug('API call {} took {:.2f}s'.format(
            command, time.time() - start_time))
def inspect_container(self, docker_id: str) -> Dict[str, Any]:
    """Look up a container by ID and return its summary.

    :param docker_id: identifier passed to ``self.cli.containers.get``
    :return: the result of ``self._container_summary`` for that container
    :raises ZoeException: if the lookup fails for any reason; the original
        error text is used as the message
    """
    try:
        container = self.cli.containers.get(docker_id)
    except Exception as e:
        raise ZoeException(str(e))
    else:
        return self._container_summary(container)
def __init__(self, docker_config: DockerHostConfig, mock_client=None) -> None:
    """Open a client connection to one Docker host.

    :param docker_config: host description; ``name``, ``address``, ``tls``
        and, when TLS is enabled, ``tls_cert``, ``tls_key`` and ``tls_ca``
        are read from it
    :param mock_client: if given, used as ``self.cli`` instead of opening
        a real connection
    :raises ZoeException: if the Docker client cannot be created
    """
    self.name = docker_config.name
    self.docker_config = docker_config

    tls = None
    if docker_config.tls:
        tls = docker.tls.TLSConfig(
            client_cert=(docker_config.tls_cert, docker_config.tls_key),
            verify=docker_config.tls_ca)

    if mock_client is not None:
        # Simplify testing
        self.cli = mock_client
        return

    try:
        self.cli = docker.DockerClient(
            base_url=docker_config.address, version="auto", tls=tls)
    except docker.errors.DockerException as e:
        raise ZoeException(
            "Cannot connect to Docker host {} at address {}: {}".format(
                docker_config.name, docker_config.address, str(e)))
def stats(self, docker_id: str, stream: bool):
    """Fetch resource-usage statistics for one container.

    :param docker_id: identifier of the container to look up
    :param stream: forwarded unchanged to the container's ``stats`` call
    :return: whatever the container's ``stats(stream=...)`` call returns
    :raises ZoeException: if the container is not found, the Docker API
        reports an error, the read times out, or the reply cannot be decoded
    """
    # Step 1: resolve the container. NotFound is handled before APIError so
    # that a missing container gets its own message.
    try:
        container = self.cli.containers.get(docker_id)
    except docker.errors.NotFound:
        raise ZoeException('Container not found')
    except docker.errors.APIError as e:
        raise ZoeException('Docker API error: {}'.format(e))

    # Step 2: query the statistics.
    try:
        usage = container.stats(stream=stream)
    except docker.errors.APIError as e:
        raise ZoeException('Docker API error: {}'.format(e))
    except requests.exceptions.ReadTimeout:
        raise ZoeException('Read timeout')
    except ValueError:
        raise ZoeException('Docker API decoding error')
    return usage
def _get_backend() -> Union[BaseBackend, None]:
    """Instantiate the backend named by the global configuration.

    :return: a backend instance for ``Kubernetes`` or ``DockerEngine``,
        or None when the configured name has no branch here
    :raises ZoeException: if the selected backend class is None
        (per the messages below, its Python dependency is unavailable)
    """
    backend_name = get_conf().backend
    assert backend_name in ['Kubernetes', 'Swarm', 'DockerEngine']

    if backend_name == 'Kubernetes':
        if KubernetesBackend is None:
            raise ZoeException(
                'The Kubernetes backend requires the pykube module')
        return KubernetesBackend(get_conf())

    if backend_name == 'DockerEngine':
        if DockerEngineBackend is None:
            raise ZoeException(
                'The Docker Engine backend requires docker python version >= 2.0.2'
            )
        return DockerEngineBackend(get_conf())

    # NOTE(review): 'Swarm' passes the assertion above but has no branch,
    # so it ends up here and yields None - confirm this is intended.
    log.error('Unknown backend selected')
    return None
def loop(self):
    """Serve API requests from the ZeroMQ socket forever.

    Each request is a JSON object whose ``command`` key selects the action:
    ``execution_start``, ``execution_terminate``, ``execution_delete`` or
    ``service_inspect``. Unknown commands are answered with an error reply.
    Executions and services are looked up through
    ``config.singletons['sql_manager']``; after every request the call is
    reported to ``config.singletons['metric']``.

    ``self.debug_has_replied`` is cleared before each dispatch and checked
    afterwards (presumably the ``_reply_*`` helpers set it - confirm
    against their definitions).

    :raises ZoeException: if a command handler finished without sending
        a reply
    """
    assert isinstance(config.singletons['sql_manager'],
                      zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(
                id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                # Reply before submitting so the client is not kept waiting.
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(
                id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(
                id=exec_id, only_one=True)
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            # Deleting an unknown execution is still answered with OK.
            self._reply_ok()
        elif command == 'service_inspect':
            service_id = message['service_id']
            service = config.singletons['sql_manager'].service_list(
                id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The placeholder was previously left unformatted, so the
            # offending command never appeared in the exception text.
            raise ZoeException(
                'BUG: command {} does not fill a reply'.format(command))

        config.singletons['metric'].metric_api_call(start_time, command)
def loop(self):
    """The API loop.

    Receive JSON requests from ``self.zmq_s`` forever and dispatch on the
    ``command`` key. Supported commands are ``execution_start``,
    ``execution_terminate``, ``execution_delete`` and ``scheduler_stats``;
    anything else is answered with an error reply. After every request the
    call is reported through ``self.metrics.metric_api_call``.

    ``self.debug_has_replied`` is cleared before each dispatch and checked
    afterwards (presumably the ``_reply_*`` helpers set it - confirm
    against their definitions).

    :raises ZoeException: if a command handler finished without sending
        a reply
    """
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = self.state.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                # Reply before submitting so the client is not kept waiting.
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(
                    self.state, self.scheduler, execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = self.state.execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(
                    self.scheduler, execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = self.state.execution_list(id=exec_id, only_one=True)
            if execution is not None:
                zoe_master.execution_manager.execution_delete(
                    self.scheduler, execution)
            # Deleting an unknown execution is still answered with OK.
            self._reply_ok()
        elif command == 'scheduler_stats':
            data = self.scheduler.stats()
            self._reply_ok(data=data)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The placeholder was previously left unformatted, so the
            # offending command never appeared in the exception text.
            raise ZoeException(
                'BUG: command {} does not fill a reply'.format(command))

        self.metrics.metric_api_call(start_time, command)
def list(self, only_label=None, status=None) -> List[dict]:
    """
    List running or defined containers.

    Filtering is delegated to the Docker API through its ``filters``
    argument; only the criteria that were actually supplied are sent.

    :param only_label: filter containers with only a certain label
    :param status: filter containers with only a certain status (one of restarting, running, paused, exited)
    :return: a list of container summaries
    :raises ZoeException: if the Docker API call fails
    """
    filters = {}
    for filter_name, filter_value in (('label', only_label), ('status', status)):
        if filter_value is not None:
            filters[filter_name] = filter_value

    try:
        containers = self.cli.containers.list(all=True, filters=filters)
    except docker.errors.APIError as ex:
        raise ZoeException(str(ex))
    except requests.exceptions.RequestException as ex:
        raise ZoeException(str(ex))

    return [self._container_summary(container) for container in containers]
def __init__(self) -> None:
    """Connect to the Swarm manager named by the ``backend_swarm_url`` option.

    The URL scheme selects how the manager address is obtained:

    * ``zk://`` - resolved through ``zookeeper_swarm`` using the
      ``backend_swarm_zk_path`` option (requires the kazoo package)
    * ``consul://`` - resolved through ``consul_swarm`` (requires the
      consul package)
    * ``http://`` - the URL is used as is, without TLS
    * ``https://`` - the URL is used as is, with a TLS configuration built
      from the ``backend_swarm_tls_*`` options

    :raises ZoeException: if the scheme is not supported, a required
        package is missing, or the Docker client cannot be created
    """
    conf = get_conf()
    url = conf.backend_swarm_url
    tls = False
    # The scheme is matched as a prefix: the zk:// and consul:// branches
    # strip it by length, which is only correct when it is at the start.
    if url.startswith('zk://'):
        if KazooClient is None:
            raise ZoeException('ZooKeeper URL for Swarm, but the kazoo package is not installed')
        url = url[len('zk://'):]
        manager = zookeeper_swarm(url, conf.backend_swarm_zk_path)
    elif url.startswith('consul://'):
        if Consul is None:
            raise ZoeException('Consul URL for Swarm, but the consul package is not installed')
        url = url[len('consul://'):]
        manager = consul_swarm(url)
    elif url.startswith('http://'):
        manager = url
    elif url.startswith('https://'):
        tls = docker.tls.TLSConfig(
            client_cert=(conf.backend_swarm_tls_cert, conf.backend_swarm_tls_key),
            verify=conf.backend_swarm_tls_ca)
        manager = url
    else:
        raise ZoeException('Unsupported URL scheme for Swarm')

    try:
        self.cli = docker.DockerClient(base_url=manager, version="auto", tls=tls)
    except docker.errors.DockerException:
        raise ZoeException("Cannot connect to Docker")
def spawn_container(self, service_instance: ServiceInstance) -> Dict[str, Any]:
    """Create and start a new container.

    The arguments for ``self.cli.containers.run`` are assembled from the
    service instance (image, command, name, hostname, labels, ports,
    environment, volumes, memory and core limits) and from the global
    configuration (overlay network name, optional GELF logging address).

    :param service_instance: description of the service to start
    :return: the summary of the freshly started container
    :raises ZoeNotEnoughResourcesException: if the API error explanation
        says no resources are available to schedule the container
    :raises ZoeException: if the image is not found or any other error
        occurs while starting the container
    """
    run_args = {
        'detach': True,
        'ports': {},
        'environment': {},
        'volumes': {},
        'working_dir': service_instance.work_dir,
        'mem_limit': 0,
        'mem_reservation': 0,
        'memswap_limit': 0,
        'name': service_instance.name,
        'network_disabled': False,
        'network_mode': get_conf().overlay_network_name,
        'image': service_instance.image_name,
        'command': service_instance.command,
        'hostname': service_instance.hostname,
        'labels': service_instance.labels,
        'cpu_period': 100000,
        'cpu_quota': 100000,
        'log_config': {
            "type": "json-file",
            "config": {}
        }
    }

    # A None binding leaves the choice of the host port to Docker.
    for port in service_instance.ports:
        run_args['ports'][str(port.number) + '/' + port.proto] = None

    for name, value in service_instance.environment:
        run_args['environment'][name] = value

    for volume in service_instance.volumes:
        if volume.type == "host_directory":
            assert isinstance(volume, VolumeDescriptionHostPath)
            run_args['volumes'][volume.path] = {
                'bind': volume.mount_point,
                'mode': ("ro" if volume.readonly else "rw")
            }
        else:
            # Unsupported volume types are reported and skipped.
            log.error('Swarm backend does not support volume type {}'.format(volume.type))

    if service_instance.memory_limit is not None:
        run_args['mem_limit'] = service_instance.memory_limit.max
        run_args['mem_reservation'] = service_instance.memory_limit.min
        # NOTE(review): presumably the reservation must be strictly lower
        # than the limit - confirm against the Docker engine behaviour.
        if service_instance.memory_limit.max == service_instance.memory_limit.min:
            run_args['mem_reservation'] -= 1

    if service_instance.core_limit is not None:
        # Quota is expressed relative to the 100000 cpu_period set above.
        run_args['cpu_quota'] = int(100000 * service_instance.core_limit.min)

    if get_conf().gelf_address != '':
        run_args['log_config'] = {
            "type": "gelf",
            "config": {
                'gelf-address': get_conf().gelf_address,
                'labels': ",".join(service_instance.labels)
            }
        }

    # When run() raises, no container object has been returned, so there is
    # nothing this method could remove: the handlers only translate errors.
    try:
        cont = self.cli.containers.run(**run_args)
    except docker.errors.ImageNotFound:
        raise ZoeException(message='Image not found')
    except docker.errors.APIError as e:
        # The explanation may arrive as bytes (raw response body) or as str
        # (decoded JSON message); normalise it before comparing.
        explanation = e.explanation
        if isinstance(explanation, bytes):
            explanation = explanation.decode('utf-8', errors='replace')
        if explanation == 'no resources available to schedule container':
            raise ZoeNotEnoughResourcesException(message=str(e))
        raise ZoeException(message=str(e))
    except Exception as e:
        raise ZoeException(str(e))

    # Fetch the container again before building the summary.
    cont = self.cli.containers.get(cont.id)
    return self._container_summary(cont)
def spawn_container(self, service_instance: ServiceInstance) -> Dict[str, Any]:
    """Create and start a new container.

    Ports, environment, volumes, the memory limit and the logging
    configuration are derived from the service instance and the global
    configuration, then passed to ``self.cli.containers.run``.

    :param service_instance: description of the service to start
    :return: the summary of the freshly started container
    :raises ZoeNotEnoughResourcesException: if the API error explanation
        says no resources are available to schedule the container
    :raises ZoeException: if the image is not found or any other error
        occurs while starting the container
    """
    # A None binding leaves the choice of the host port to Docker.
    port_bindings = {}  # type: Dict[str, Any]
    for port in service_instance.ports:
        port_bindings[str(port.number) + '/' + port.proto] = None

    environment = {}
    for name, value in service_instance.environment:
        environment[name] = value

    volumes = {}
    for volume in service_instance.volumes:
        if volume.type == "host_directory":
            assert isinstance(volume, VolumeDescriptionHostPath)
            volumes[volume.path] = {
                'bind': volume.mount_point,
                'mode': ("ro" if volume.readonly else "rw")
            }
        else:
            # Unsupported volume types are reported and skipped.
            log.error('Swarm backend does not support volume type {}'.format(volume.type))

    if service_instance.memory_limit is not None:
        mem_limit = service_instance.memory_limit.max
    else:
        mem_limit = 0

    # Swarm backend does not support cores in a consistent way, see https://github.com/docker/swarm/issues/475

    if get_conf().gelf_address != '':
        log_config = {
            "type": "gelf",
            "config": {
                'gelf-address': get_conf().gelf_address,
                'labels': ",".join(service_instance.labels)
            }
        }
    else:
        log_config = {
            "type": "json-file",
            "config": {}
        }

    # When run() raises, no container object has been returned, so there is
    # nothing this method could remove: the handlers only translate errors.
    try:
        cont = self.cli.containers.run(image=service_instance.image_name,
                                       command=service_instance.command,
                                       detach=True,
                                       environment=environment,
                                       hostname=service_instance.hostname,
                                       labels=service_instance.labels,
                                       log_config=log_config,
                                       mem_limit=mem_limit,
                                       memswap_limit=0,
                                       name=service_instance.name,
                                       network_disabled=False,
                                       network_mode=get_conf().overlay_network_name,
                                       ports=port_bindings,
                                       working_dir=service_instance.work_dir,
                                       volumes=volumes)
    except docker.errors.ImageNotFound:
        raise ZoeException(message='Image not found')
    except docker.errors.APIError as e:
        # The explanation may arrive as bytes (raw response body) or as str
        # (decoded JSON message); normalise it before comparing.
        explanation = e.explanation
        if isinstance(explanation, bytes):
            explanation = explanation.decode('utf-8', errors='replace')
        if explanation == 'no resources available to schedule container':
            raise ZoeNotEnoughResourcesException(message=str(e))
        raise ZoeException(message=str(e))
    except Exception as e:
        raise ZoeException(str(e))

    # Fetch the container again before building the summary.
    cont = self.cli.containers.get(cont.id)
    return self._container_summary(cont)