def get_docker_containers(user_list=None, host_namespace=''):
    """
    Get the list of running Docker containers, as `DockerContainer` objects.

    This is basically polling. Ideally, we should subscribe to Docker
    events so we can keep the containers list up to date without having
    to poll like this.

    :param host_namespace: string representing the host name (e.g. host IP)
    :param user_list: comma-separated string of Docker container IDs
        (long or short form), or 'all' (any casing) / None to include
        every container.
    :return: a generator of DockerContainer objects
    """
    # Build the filter set once, outside the loop: original code re-split
    # user_list on every container.  None means "no filtering".
    filter_ids = None
    if user_list is not None and user_list.lower() != 'all':
        # Docker short IDs are the first 12 hex chars of the long ID.
        filter_ids = frozenset(cid[:12] for cid in user_list.split(','))
    for inspect in exec_dockerps():
        long_id = inspect['Id']
        if filter_ids is not None and long_id[:12] not in filter_ids:
            continue
        try:
            c = DockerContainer(long_id, inspect=inspect,
                                host_namespace=host_namespace)
            # Containers without a namespace are skipped, not reported.
            if c.namespace:
                yield c
        except ContainerInvalidEnvironment as e:
            logger.exception(e)
def _crawl_in_system(self):
    '''
    Crawl per-GPU metrics via NVML and yield one (key, entry, 'gpu')
    tuple per GPU.

    nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
    POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
    PAGE_RETIREMENT, ACCOUNTING

    currently, following are requested based on dlaas requirements:
        utilization.gpu, utilization.memory,
        memory.total, memory.free, memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
        memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    # _init_nvml() signals failure with -1 rather than an exception.
    if self._init_nvml() == -1:
        return

    self.inspect_arr = exec_dockerps()

    try:
        num_gpus = pynvml.nvmlDeviceGetCount()
        for gpuid in range(num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            entry = {
                'utilization': {
                    'gpu': util.gpu,
                    'memory': util.memory
                },
                'memory': {
                    # NVML reports bytes; convert to MiB.
                    'total': memory.total / 1024 / 1024,
                    'free': memory.free / 1024 / 1024,
                    'used': memory.used / 1024 / 1024
                },
                'temperature': temperature,
                'power': {
                    # NVML reports milliwatts; convert to watts.
                    'draw': pynvml.nvmlDeviceGetPowerUsage(
                        gpuhandle) / 1000,
                    'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(
                        gpuhandle) / 1000
                }
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            yield (key, entry, 'gpu')
    finally:
        # The original only shut NVML down on the last loop iteration,
        # leaking the NVML session when num_gpus == 0, when an NVML call
        # raised, or when the consumer abandoned the generator early.
        # finally runs in all of those cases (including GeneratorExit).
        self._shutdown_nvml()
def crawl(self, **kwargs):
    """
    Yield one (container id, DockerPSFeature, 'dockerps') tuple per
    container reported by exec_dockerps().
    """
    # Lazy %-style logging args: the message is only formatted when
    # DEBUG is actually enabled (original eagerly formatted with %).
    logger.debug('Crawling %s', self.get_feature())
    for inspect in exec_dockerps():
        yield (inspect['Id'],
               DockerPSFeature._make([
                   inspect['State']['Running'],
                   0,
                   inspect['Image'],
                   [],
                   inspect['Config']['Cmd'],
                   inspect['Name'],
                   inspect['Id'],
               ]),
               'dockerps')
def _crawl_in_system(self):
    '''
    Crawl per-GPU metrics via NVML and yield one (key, entry, 'gpu')
    tuple per GPU.

    nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
    POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
    PAGE_RETIREMENT, ACCOUNTING

    currently, following are requested based on dlaas requirements:
        utilization.gpu, utilization.memory,
        memory.total, memory.free, memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
        memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    # _init_nvml() signals failure with -1 rather than an exception.
    if self._init_nvml() == -1:
        return

    self.inspect_arr = exec_dockerps()

    try:
        num_gpus = pynvml.nvmlDeviceGetCount()
        for gpuid in range(num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            entry = {
                'utilization': {'gpu': util.gpu, 'memory': util.memory},
                'memory': {
                    # NVML reports bytes; convert to MiB.
                    'total': memory.total / 1024 / 1024,
                    'free': memory.free / 1024 / 1024,
                    'used': memory.used / 1024 / 1024
                },
                'temperature': temperature,
                'power': {
                    # NVML reports milliwatts; convert to watts.
                    'draw': pynvml.nvmlDeviceGetPowerUsage(
                        gpuhandle) / 1000,
                    'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(
                        gpuhandle) / 1000
                }
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            yield (key, entry, 'gpu')
    finally:
        # The original only shut NVML down on the last loop iteration,
        # leaking the NVML session when num_gpus == 0, when an NVML call
        # raised, or when the consumer abandoned the generator early.
        # finally runs in all of those cases (including GeneratorExit).
        self._shutdown_nvml()
def test_dockerps(self):
    """
    The running test container's long ID should match what
    exec_dockerps() reports for the first (and only) container.
    """
    # Take the first inspect record explicitly; the original left
    # c_long_id unbound (NameError) if exec_dockerps() yielded nothing,
    # which masked the real failure mode.
    inspect = next(iter(exec_dockerps()), None)
    assert inspect is not None, 'exec_dockerps() reported no containers'
    # there should only be one container anyway
    assert self.container['Id'] == inspect['Id']