#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import click

from modelci.app import (start as app_start, stop as app_stop)
from modelci.utils import Logger
from modelci.utils.docker_container_manager import DockerContainerManager

logger = Logger(__name__, welcome=False)


@click.group()
def service():
    pass


@service.command("init")
@click.option('--gpu', default=False, type=click.BOOL, is_flag=True)
def start_leader_server(gpu=False):
    """Start the system on a leader server in a cluster.

    Initializes necessary services such as the database and the monitor.

    Args:
        gpu (bool, optional): Whether to enable GPU support for the started
            services. Defaults to False.
    """
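    # Body sketch (assumed; the excerpt cuts off after the docstring and does
    # not show the original implementation). Based on the imports above, a
    # plausible body starts the cluster containers and then the backend app:
    DockerContainerManager(enable_gpu=gpu).start()
    app_start()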
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import shutil
from pathlib import Path

from modelci.utils import Logger

logger = Logger('converter', welcome=False)


class TFSConverter(object):
    supported_framework = ["tensorflow"]

    @staticmethod
    def from_tensorflow(model, save_path: Path, override: bool = False):
        import tensorflow as tf

        if save_path.with_suffix('.zip').exists() and not override:
            # a cached archive exists and the override flag is not set
            logger.info('Use cached model')
            return True

        tf.compat.v1.saved_model.save(model, str(save_path))
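
# Usage sketch (assumed; not part of the original source). The model below is
# hypothetical; `from_tensorflow` writes a SavedModel directory that
# TensorFlow Serving can load directly:
#
#     import tensorflow as tf
#     model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
#     TFSConverter.from_tensorflow(model, Path('/tmp/tfs_model/1'), override=True)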
import random
import subprocess
import time
from pathlib import Path

import docker
from pymongo import MongoClient

from modelci.utils import Logger

# NOTE (assumed): the project-internal helpers (list_containers, get_image,
# check_container_status) and constants (MONGO_HOST, MONGO_PORT,
# MONGO_USERNAME, MONGO_PASSWORD, MONGO_DB, CADVISOR_PORT, NODE_EXPORTER_PORT,
# MODELCI_DOCKER_LABEL, MODELCI_GPU_LABEL, MODELCI_DOCKER_PORT_LABELS) are
# imported from elsewhere in the project; the excerpt does not show those
# import lines.


class DockerContainerManager(object):
    def __init__(
            self,
            cluster_name='default-cluster',
            mongo_ip=MONGO_HOST,
            mongo_port=MONGO_PORT,
            cadvisor_port=CADVISOR_PORT,
            node_exporter_port=NODE_EXPORTER_PORT,
            docker_network='modelci_network',
            extra_container_kwargs=None,
            enable_gpu=False,
    ):
        self.cluster_name = cluster_name
        self.mongo_ip = mongo_ip
        self.mongo_port = mongo_port
        self.cadvisor_port = cadvisor_port
        self.node_exporter_port = node_exporter_port
        self.docker_network = docker_network
        self.enable_gpu = enable_gpu
        self.docker_client = docker.from_env()

        if extra_container_kwargs is None:
            self.extra_container_kwargs = {}
        else:
            self.extra_container_kwargs = extra_container_kwargs.copy()

        # Merge ModelCI-specific labels with any user-provided labels
        if 'labels' in self.extra_container_kwargs:
            self.common_labels = self.extra_container_kwargs.pop('labels')
            self.common_labels.update({MODELCI_DOCKER_LABEL: self.cluster_name})
        else:
            self.common_labels = {MODELCI_DOCKER_LABEL: self.cluster_name}

        container_kwargs = {'detach': True}
        self.extra_container_kwargs.update(container_kwargs)

        # create logger
        self.logger = Logger('ml-modelci Docker Container Manager', welcome=False)

    def start(self):
        """Start the ModelCI service."""
        # remove stale containers started with a different GPU-enabled flag
        self._remove(filters={'label': [f'{MODELCI_GPU_LABEL}={not self.enable_gpu}']})

        containers_in_cluster = list_containers(
            docker_client=self.docker_client,
            filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}
        )
        if len(containers_in_cluster) == 4:
            self.logger.error(f'Cluster {self.cluster_name} cannot be started because it already exists.')
            return False

        # download all required docker images
        self._download_serving_containers()

        # determine which containers have already started
        all_labels = dict()
        for container in containers_in_cluster:
            all_labels.update(container.labels)
            if container.attrs['State']['Status'] != 'running':
                # try to start the stopped container
                self.logger.warning(f'Service already exists, found container name={container.name}.')
                container.start()
                self.logger.info('Service started.')
            else:
                self.logger.warning(f'Service with container name={container.name} already started.')

        if MODELCI_DOCKER_PORT_LABELS['mongo'] not in all_labels:
            self._start_mongo_db()
        if MODELCI_DOCKER_PORT_LABELS['cadvisor'] not in all_labels:
            self._start_cadvisor()
        if MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter'] not in all_labels:
            self._start_dcgm_node_exporter()
        if MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter'] not in all_labels:
            self._start_gpu_metrics_node_exporter()

        return self.connect()

    def connect(self):
        """Use the cluster name to refresh the recorded ports, since the
        ports chosen at construction time may have been remapped when the
        cluster was started.
""" containers = list_containers( docker_client=self.docker_client, filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}) all_labels = dict() for container in containers: all_labels.update(container.labels) self.mongo_port = all_labels[MODELCI_DOCKER_PORT_LABELS['mongo']] self.cadvisor_port = all_labels[MODELCI_DOCKER_PORT_LABELS['cadvisor']] self.node_exporter_port = all_labels[MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']] return True def stop(self): containers = list_containers( self.docker_client, filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}'], 'status': 'running'} ) for container in containers: container.stop() self.logger.info(f'Container name={container.name} stopped.') def remove_all(self): containers = list_containers( self.docker_client, filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']} ) for container in containers: container.stop() container.remove() self.logger.info(f'Container {container.id} is removed.') def _remove(self, filters): containers = list_containers(self.docker_client, filters) for container in containers: container.stop() container.remove() self.logger.info(f'Container {container.id} is removed.') def _download_serving_containers(self): images = [ 'mlmodelci/pytorch-serving:latest', 'mlmodelci/onnx-serving:latest', 'tensorflow/serving:2.1.0', 'mongo:latest', 'google/cadvisor:latest', 'bgbiao/dcgm-exporter:latest', 'bgbiao/gpu-metrics-exporter:latest', ] if self.enable_gpu: images.extend([ 'mlmodelci/pytorch-serving:latest-gpu', 'mlmodelci/onnx-serving:latest-gpu', 'tensorflow/serving:2.1.0-gpu', 'nvcr.io/nvidia/tensorrtserver:19.10-py3', ]) for image in images: get_image(self.docker_client, image, self.logger) def _start_mongo_db(self): """Start Mongo DB service. From https://stackoverflow.com/a/53522699/13173608. 
""" mongo_name = f'mongo-{random.randint(0, 100000)}' self.docker_client.containers.run( 'mongo', ports={'27017/tcp': self.mongo_port}, name=mongo_name, environment={ 'MONGO_INITDB_USERNAME': MONGO_USERNAME, 'MONGO_INITDB_PASSWORD': MONGO_PASSWORD, 'MONGO_INITDB_DATABASE': MONGO_DB, }, labels={**self.common_labels, MODELCI_DOCKER_PORT_LABELS['mongo']: str(self.mongo_port)}, **self.extra_container_kwargs ) time.sleep(1) try: # create MongoDB user client = MongoClient(f'{MONGO_HOST}:{MONGO_PORT}') kwargs = {'pwd': MONGO_PASSWORD, 'roles': [{'role': 'readWrite', 'db': MONGO_DB}]} getattr(client, MONGO_DB).command("createUser", MONGO_USERNAME, **kwargs) except Exception as e: self.logger.error(f'Exception during starting MongoDB: {e}') container = list_containers(self.docker_client, filters={'name': mongo_name})[0] container.kill() container.remove() return check_container_status(self.docker_client, name=mongo_name) self.logger.info(f'Container name={mongo_name} stared') def _start_cadvisor(self): """Start cAdvisor service.""" cadvisor_name = f'cadvisor-{random.randint(0, 100000)}' volumes = { '/': {'bind': '/rootfs', 'mode': 'ro'}, '/var/run': {'bind': '/var/run', 'mode': 'rw'}, '/sys': {'bind': '/sys', 'mode': 'ro'}, '/var/lib/docker': {'bind': '/var/lib/docker', 'mode': 'ro'}, } extra_container_kwargs = self.extra_container_kwargs.copy() if self.enable_gpu: # find libnvidia-ml.so.1 cache_file = Path('/tmp/libnvidia-ml.cache') if cache_file.exists(): with open(cache_file) as f: libnvidia_ml_path = f.read().strip() else: args1 = ('locate', 'libnvidia-ml.so.1') args2 = ('grep', '-v', 'lib32') args3 = ('head', '-1') locate = subprocess.Popen(args1, stdout=subprocess.PIPE) grep = subprocess.Popen(args2, stdin=locate.stdout, stdout=subprocess.PIPE) locate.wait() grep.wait() libnvidia_ml_path = subprocess.check_output( args3, stdin=grep.stdout, universal_newlines=True, text=True ).strip() # save to cache with open(cache_file, 'w') as f: f.write(libnvidia_ml_path) volumes.update({libnvidia_ml_path: {'bind': libnvidia_ml_path}}) extra_container_kwargs.update({'environment': {'LD_LIBRARY_PATH': str(Path(libnvidia_ml_path).parent)}}) self.docker_client.containers.run( 'google/cadvisor:latest', name=cadvisor_name, ports={'8080/tcp': self.cadvisor_port}, privileged=True, volumes=volumes, labels={ **self.common_labels, MODELCI_DOCKER_PORT_LABELS['cadvisor']: str(self.cadvisor_port), MODELCI_GPU_LABEL: str(self.enable_gpu), }, **extra_container_kwargs ) check_container_status(self.docker_client, name=cadvisor_name) self.logger.info(f'Container name={cadvisor_name} started.') def _start_dcgm_node_exporter(self): """Start node exporter service.""" rand_num = random.randint(0, 100000) dcgm_name = f'dcgm-exporter-{rand_num}' # start dcgm-exporter self.docker_client.containers.run( 'bgbiao/dcgm-exporter', runtime='nvidia', name=dcgm_name, labels={ **self.common_labels, MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']: '-1', MODELCI_GPU_LABEL: str(self.enable_gpu), }, **self.extra_container_kwargs ) check_container_status(self.docker_client, dcgm_name) self.logger.info(f'Container name={dcgm_name} started.') def _start_gpu_metrics_node_exporter(self): rand_num = random.randint(0, 100000) gpu_metrics_name = f'gpu-metrics-exporter-{rand_num}' dcgm_container = list_containers( self.docker_client, filters={'label': [MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']]} )[0] # start gpu-metric-exporter self.docker_client.containers.run( 'bgbiao/gpu-metrics-exporter', privileged=True, name=gpu_metrics_name, 
            ports={'9400/tcp': self.node_exporter_port},
            volumes_from=[dcgm_container.id],
            labels={
                **self.common_labels,
                MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']: str(self.node_exporter_port),
                MODELCI_GPU_LABEL: str(self.enable_gpu),
            },
            **self.extra_container_kwargs
        )
        check_container_status(self.docker_client, gpu_metrics_name)
        self.logger.info(f'Container name={gpu_metrics_name} started.')
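
# Usage sketch (assumed; not part of the original source). Typical lifecycle
# of the manager on a leader node:
#
#     manager = DockerContainerManager(cluster_name='default-cluster', enable_gpu=False)
#     manager.start()       # pull images, launch MongoDB/cAdvisor/exporters, refresh ports
#     ...
#     manager.stop()        # stop all running containers in this cluster
#     manager.remove_all()  # stop and remove them entirely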
""" Author: Li Yuanming Email: [email protected] Date: 6/19/2020 """ import os import signal import subprocess import sys from pathlib import Path from modelci.config import app_settings from modelci.utils import Logger from modelci.utils.misc import check_process_running logger = Logger('modelci backend', welcome=False) def start(): """Run a ModelCI backend server with Uvicorn.""" # check if the process is running pid = check_process_running(app_settings.server_port) if not pid: args = [ sys.executable, f'{Path(__file__).absolute().parent / "main.py"}', '&>', f'{Path.home()}/tmp/test.log', '&' ] backend_process = subprocess.Popen(args, preexec_fn=os.setsid, close_fds=True) logger.info(