def _start_mongo_db(self):
    """Start the MongoDB service container and create the application user.

    Adapted from https://stackoverflow.com/a/53522699/13173608.

    The ``mongo`` image is started with container port 27017 published on
    ``self.mongo_port``. After a one-second pause, a ``createUser`` command
    is issued to grant ``MONGO_USERNAME`` the ``readWrite`` role on
    ``MONGO_DB``.

    If user creation fails for any reason, the error is logged, the freshly
    started container is killed and removed, and the method returns without
    raising.
    """
    mongo_name = f'mongo-{random.randint(0, 100000)}'

    self.docker_client.containers.run(
        'mongo',
        ports={'27017/tcp': self.mongo_port},
        name=mongo_name,
        environment={
            'MONGO_INITDB_USERNAME': MONGO_USERNAME,
            'MONGO_INITDB_PASSWORD': MONGO_PASSWORD,
            'MONGO_INITDB_DATABASE': MONGO_DB,
        },
        labels={**self.common_labels, MODELCI_DOCKER_PORT_LABELS['mongo']: str(self.mongo_port)},
        **self.extra_container_kwargs
    )

    # Pause before connecting; presumably gives mongod time to start
    # accepting connections.
    time.sleep(1)

    client = None
    try:
        # Create the MongoDB user.
        # NOTE(review): this connects to MONGO_PORT while the container is
        # published on self.mongo_port -- presumably the same value; confirm.
        client = MongoClient(f'{MONGO_HOST}:{MONGO_PORT}')
        kwargs = {'pwd': MONGO_PASSWORD, 'roles': [{'role': 'readWrite', 'db': MONGO_DB}]}
        client[MONGO_DB].command("createUser", MONGO_USERNAME, **kwargs)
    except Exception as e:
        self.logger.error(f'Exception during starting MongoDB: {e}')
        # Roll back: do not leave a half-configured container behind. The
        # lookup is guarded so that a missing container does not raise an
        # IndexError that would mask the original failure.
        containers = list_containers(self.docker_client, filters={'name': mongo_name})
        if containers:
            container = containers[0]
            container.kill()
            container.remove()
        return
    finally:
        # Release the client's sockets and background monitor threads on
        # both the success and failure paths.
        if client is not None:
            client.close()

    check_container_status(self.docker_client, name=mongo_name)
    self.logger.info(f'Container name={mongo_name} started')
def _start_gpu_metrics_node_exporter(self):
    """Start the GPU metrics exporter container.

    The exporter reuses the volumes of the first container that carries the
    DCGM node exporter label, so that container must already be running.
    Container port 9400 is published on ``self.node_exporter_port``.

    Raises:
        IndexError: If no container with the DCGM node exporter label is
            found.
    """
    rand_num = random.randint(0, 100000)
    gpu_metrics_name = f'gpu-metrics-exporter-{rand_num}'

    dcgm_containers = list_containers(
        self.docker_client,
        filters={'label': [MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']]},
    )
    if not dcgm_containers:
        # Same exception type as the bare `[0]` lookup would raise, but with
        # a message that tells the operator what is missing.
        raise IndexError(
            'No DCGM exporter container found; start the DCGM node exporter '
            'before the GPU metrics exporter.'
        )
    dcgm_container = dcgm_containers[0]

    # Start gpu-metrics-exporter, sharing the DCGM container's volumes.
    self.docker_client.containers.run(
        'bgbiao/gpu-metrics-exporter',
        privileged=True,
        name=gpu_metrics_name,
        ports={'9400/tcp': self.node_exporter_port},
        volumes_from=[dcgm_container.id],
        labels={
            **self.common_labels,
            MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']: str(self.node_exporter_port),
            MODELCI_GPU_LABEL: str(self.enable_gpu),
        },
        **self.extra_container_kwargs
    )

    check_container_status(self.docker_client, gpu_metrics_name)
    self.logger.info(f'{gpu_metrics_name} started')
def _start_cadvisor(self):
    """Start the cAdvisor service container.

    Host paths ``/``, ``/var/run``, ``/sys`` and ``/var/lib/docker`` are
    mounted into the container, and container port 8080 is published on
    ``self.cadvisor_port``.

    When ``self.enable_gpu`` is set, the host's ``libnvidia-ml.so.1`` is
    additionally mounted at the same path and its directory is exported as
    ``LD_LIBRARY_PATH``. The library path is looked up with ``locate``
    (skipping any path containing ``lib32``) and cached in
    ``/tmp/libnvidia-ml.cache``.

    Raises:
        FileNotFoundError: If GPU support is enabled but ``locate`` is not
            installed or reports no suitable ``libnvidia-ml.so.1``.
    """
    cadvisor_name = f'cadvisor-{random.randint(0, 100000)}'

    volumes = {
        '/': {'bind': '/rootfs', 'mode': 'ro'},
        '/var/run': {'bind': '/var/run', 'mode': 'rw'},
        '/sys': {'bind': '/sys', 'mode': 'ro'},
        '/var/lib/docker': {'bind': '/var/lib/docker', 'mode': 'ro'},
    }

    # Copy so that the GPU-specific additions do not leak into the kwargs
    # shared with the other services.
    extra_container_kwargs = self.extra_container_kwargs.copy()

    if self.enable_gpu:
        # Find libnvidia-ml.so.1, preferring the cached location.
        cache_file = Path('/tmp/libnvidia-ml.cache')
        libnvidia_ml_path = ''
        if cache_file.exists():
            libnvidia_ml_path = cache_file.read_text().strip()

        # An empty cache file is treated as a miss rather than a valid path.
        if not libnvidia_ml_path:
            # Equivalent of `locate libnvidia-ml.so.1 | grep -v lib32 | head -1`.
            # A single child process whose output is fully consumed by run()
            # avoids waiting on children with unread pipes. The exit status
            # is deliberately not checked: `locate` exits non-zero when it
            # finds nothing, which is handled below.
            located = subprocess.run(
                ('locate', 'libnvidia-ml.so.1'),
                stdout=subprocess.PIPE,
                text=True,
            )
            libnvidia_ml_path = next(
                (line.strip() for line in located.stdout.splitlines() if 'lib32' not in line),
                '',
            )
            if not libnvidia_ml_path:
                # Fail before caching or mounting an empty path.
                raise FileNotFoundError(
                    'Cannot locate libnvidia-ml.so.1, which is required to '
                    'start cAdvisor with GPU support.'
                )
            # Save to cache.
            cache_file.write_text(libnvidia_ml_path)

        volumes.update({libnvidia_ml_path: {'bind': libnvidia_ml_path}})
        extra_container_kwargs.update(
            {'environment': {'LD_LIBRARY_PATH': str(Path(libnvidia_ml_path).parent)}}
        )

    self.docker_client.containers.run(
        'google/cadvisor:latest',
        name=cadvisor_name,
        ports={'8080/tcp': self.cadvisor_port},
        privileged=True,
        volumes=volumes,
        labels={
            **self.common_labels,
            MODELCI_DOCKER_PORT_LABELS['cadvisor']: str(self.cadvisor_port),
            MODELCI_GPU_LABEL: str(self.enable_gpu),
        },
        **extra_container_kwargs
    )

    check_container_status(self.docker_client, name=cadvisor_name)
    self.logger.info(f'Container name={cadvisor_name} started.')
def _start_dcgm_node_exporter(self):
    """Launch the DCGM exporter container on the ``nvidia`` runtime.

    The container gets a randomised ``dcgm-exporter-<n>`` name and is
    labelled with the common labels, the DCGM node exporter port label (set
    to ``'-1'``) and the GPU flag. Its status is checked before the start is
    logged.
    """
    suffix = random.randint(0, 100000)
    container_name = f'dcgm-exporter-{suffix}'

    # Assemble the labels up front; insertion order matches the order in
    # which the labels are declared for the other services.
    labels = dict(self.common_labels)
    labels[MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']] = '-1'
    labels[MODELCI_GPU_LABEL] = str(self.enable_gpu)

    # Start dcgm-exporter.
    run_container = self.docker_client.containers.run
    run_container(
        'bgbiao/dcgm-exporter',
        runtime='nvidia',
        name=container_name,
        labels=labels,
        **self.extra_container_kwargs
    )

    check_container_status(self.docker_client, container_name)
    self.logger.info(f'Container name={container_name} started.')