def stop(self):
    """Stop every running container that belongs to this cluster.

    Containers are selected by the ``MODELCI_DOCKER_LABEL=<cluster_name>``
    label together with the ``running`` status. They are only stopped, not
    removed, and one info line is logged per stopped container.
    """
    cluster_label = f'{MODELCI_DOCKER_LABEL}={self.cluster_name}'
    running_filter = {'label': [cluster_label], 'status': 'running'}

    for running in list_containers(self.docker_client, filters=running_filter):
        running.stop()
        self.logger.info(f'Container name={running.name} stopped.')
def _start_mongo_db(self):
    """Start the MongoDB service container and create the application user.

    Runs the ``mongo`` image with container port ``27017/tcp`` published on
    ``self.mongo_port``, then connects to the server and creates
    ``MONGO_USERNAME`` with the ``readWrite`` role on ``MONGO_DB``.

    If anything goes wrong while creating the user, the error is logged, the
    container started by this call is killed and removed, and the method
    returns early without checking the container status.

    Approach taken from https://stackoverflow.com/a/53522699/13173608.
    """
    mongo_name = f'mongo-{random.randint(0, 100000)}'
    self.docker_client.containers.run(
        'mongo',
        ports={'27017/tcp': self.mongo_port},
        name=mongo_name,
        environment={
            'MONGO_INITDB_USERNAME': MONGO_USERNAME,
            'MONGO_INITDB_PASSWORD': MONGO_PASSWORD,
            'MONGO_INITDB_DATABASE': MONGO_DB,
        },
        labels={
            **self.common_labels,
            MODELCI_DOCKER_PORT_LABELS['mongo']: str(self.mongo_port),
        },
        **self.extra_container_kwargs
    )

    # Presumably gives the server a moment to come up before connecting.
    time.sleep(1)

    try:
        # create MongoDB user
        # NOTE(review): this connects to MONGO_HOST:MONGO_PORT although the
        # container is published on self.mongo_port -- confirm they agree.
        client = MongoClient(f'{MONGO_HOST}:{MONGO_PORT}')
        try:
            client[MONGO_DB].command(
                'createUser',
                MONGO_USERNAME,
                pwd=MONGO_PASSWORD,
                roles=[{'role': 'readWrite', 'db': MONGO_DB}],
            )
        finally:
            # Always release the connection, whether or not the command
            # succeeded.
            client.close()
    except Exception as e:
        self.logger.error(f'Exception during starting MongoDB: {e}')
        # Docker's name filter matches substrings, so pick the container
        # whose name is exactly ours; tolerate it being gone already rather
        # than raising IndexError from inside the error handler.
        candidates = list_containers(self.docker_client, filters={'name': mongo_name})
        failed = next((c for c in candidates if c.name == mongo_name), None)
        if failed is not None:
            failed.kill()
            failed.remove()
        return

    check_container_status(self.docker_client, name=mongo_name)
    self.logger.info(f'Container name={mongo_name} started')
def _start_gpu_metrics_node_exporter(self):
    """Start the GPU metrics exporter container.

    Looks up a container carrying the DCGM node exporter port label and runs
    the ``bgbiao/gpu-metrics-exporter`` image in privileged mode with that
    container's volumes attached. Container port ``9400/tcp`` is published
    on ``self.node_exporter_port``.

    Raises:
        IndexError: If no container with the DCGM node exporter label is
            found.
    """
    gpu_metrics_name = f'gpu-metrics-exporter-{random.randint(0, 100000)}'

    dcgm_containers = list_containers(
        self.docker_client,
        filters={'label': [MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']]},
    )
    if not dcgm_containers:
        # Same exception type a bare ``[0]`` would raise, but with a message
        # that says what is actually missing.
        raise IndexError(
            'No DCGM node exporter container found; it must be started '
            'before the GPU metrics exporter.'
        )
    dcgm_container = dcgm_containers[0]

    # start gpu-metric-exporter
    self.docker_client.containers.run(
        'bgbiao/gpu-metrics-exporter',
        privileged=True,
        name=gpu_metrics_name,
        ports={'9400/tcp': self.node_exporter_port},
        volumes_from=[dcgm_container.id],
        labels={
            **self.common_labels,
            MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']: str(self.node_exporter_port),
            MODELCI_GPU_LABEL: str(self.enable_gpu),
        },
        **self.extra_container_kwargs
    )

    check_container_status(self.docker_client, gpu_metrics_name)
    self.logger.info(f'{gpu_metrics_name} started')
def remove_all(self):
    """Stop and remove every container that belongs to this cluster.

    Containers are selected by the ``MODELCI_DOCKER_LABEL=<cluster_name>``
    label regardless of their status. The work is delegated to
    :meth:`_remove`, which stops, removes and logs each matching container.
    """
    self._remove(
        filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}
    )
def start(self):
    """Start the ModelCI service.

    Steps, in order:

    1. Remove containers whose GPU label is the opposite of
       ``self.enable_gpu``.
    2. Refuse to start when every managed service already has a running
       container in this cluster.
    3. Download the required Docker images.
    4. Restart any stopped container of this cluster.
    5. Start each managed service whose port label is not present on any
       container of this cluster.
    6. Refresh the ports through :meth:`connect`.

    Returns:
        bool: ``False`` when the cluster is already fully running, otherwise
        the result of :meth:`connect`.
    """
    # (service key, starter) pairs in start order. The GPU metrics exporter
    # attaches the DCGM exporter's volumes, so DCGM has to come first.
    service_starters = (
        ('mongo', self._start_mongo_db),
        ('cadvisor', self._start_cadvisor),
        ('dcgm_node_exporter', self._start_dcgm_node_exporter),
        ('gpu_metrics_node_exporter', self._start_gpu_metrics_node_exporter),
    )

    # remove incorrect containers with different GPU enabled flag
    self._remove(
        filters={'label': [f'{MODELCI_GPU_LABEL}={not self.enable_gpu}']})

    containers_in_cluster = list_containers(
        docker_client=self.docker_client,
        filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']})

    # The cluster "already exists" only when every managed service is up.
    # Counting stopped containers here would make a stopped cluster
    # impossible to restart, which the loop below explicitly supports.
    running_labels = set()
    for container in containers_in_cluster:
        if container.attrs['State']['Status'] == 'running':
            running_labels.update(container.labels)
    if all(MODELCI_DOCKER_PORT_LABELS[service] in running_labels
           for service, _ in service_starters):
        self.logger.error(
            f'Cluster {self.cluster_name} cannot be started because it already exists.'
        )
        return False

    # download all required docker images
    self._download_serving_containers()

    # obtain which containers has started
    all_labels = dict()
    for container in containers_in_cluster:
        all_labels.update(container.labels)
        if container.attrs['State']['Status'] != 'running':
            # try start stopped container
            self.logger.warning(
                f'Service already exist, found container name={container.name}.'
            )
            container.start()
            self.logger.info('Service started.')
        else:
            self.logger.warning(
                f'Service with container name={container.name} already started.'
            )

    # Start whatever service has no container in this cluster yet.
    for service, starter in service_starters:
        if MODELCI_DOCKER_PORT_LABELS[service] not in all_labels:
            starter()

    return self.connect()
def connect(self):
    """Refresh the service ports from the labels of this cluster's containers.

    The labels of all containers tagged
    ``MODELCI_DOCKER_LABEL=<cluster_name>`` are merged into one mapping
    (a later container overrides an earlier one on duplicate keys). The
    values stored under the mongo, cadvisor and GPU metrics node exporter
    port labels are then assigned, in that order, to ``self.mongo_port``,
    ``self.cadvisor_port`` and ``self.node_exporter_port``. The values are
    stored exactly as they appear in the labels.

    Returns:
        bool: Always ``True``.

    Raises:
        KeyError: If one of the three port labels is missing from the merged
            labels. Ports assigned before the missing one stay updated.
    """
    cluster_filter = {'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}
    members = list_containers(
        docker_client=self.docker_client, filters=cluster_filter)

    merged_labels = {
        key: value
        for member in members
        for key, value in member.labels.items()
    }

    port_attributes = (
        ('mongo_port', 'mongo'),
        ('cadvisor_port', 'cadvisor'),
        ('node_exporter_port', 'gpu_metrics_node_exporter'),
    )
    for attribute, service in port_attributes:
        setattr(self, attribute, merged_labels[MODELCI_DOCKER_PORT_LABELS[service]])

    return True
def _remove(self, filters):
    """Stop and remove every container matched by ``filters``.

    Args:
        filters: Filter mapping handed to ``list_containers`` unchanged.

    One info line carrying the container id is logged per removed container.
    """
    for doomed in list_containers(self.docker_client, filters):
        doomed.stop()
        doomed.remove()
        self.logger.info(f'Container {doomed.id} is removed.')