Example #1
def get_utilization_rates(handle):
    try:
        # Query NVML once and reuse the result for both fields.
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return dict(
            gpu=rates.gpu,
            memory=rates.memory,
        )
    except pynvml.NVMLError_Unknown:
        return dict(
            gpu=None,
            memory=None,
        )
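The helper above assumes NVML has already been initialized and that the caller supplies a device handle. A minimal setup sketch under that assumption (device index 0 is illustrative):

import pynvml

pynvml.nvmlInit()                              # must run before any NVML query
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first visible GPU
print(get_utilization_rates(handle))           # e.g. {'gpu': 42, 'memory': 17}
pynvml.nvmlShutdown()                          # release NVML when finished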
Example #2
def real_time():
    return {
        "utilization":
        [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
        "memory-used":
        [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles],
    }
Example #3
def admin_system():
    factor = 1073741824  # bytes in one GiB
    vmem = psutil.virtual_memory()
    ram = {
        "percent": vmem.percent,
        "used": round(vmem.used / factor, 2),
        "total": round(vmem.total / factor, 2)
    }  # GB

    hdd = psutil.disk_usage(app.config['USERSPACE_FOLDER'])
    disk_usage = {
        "percent": round((hdd.used / hdd.total) * 100, 2),
        "used": round(hdd.used / factor, 2),
        "total": round(hdd.total / factor, 2)
    }  # GB

    gpus = []
    pynvml.nvmlInit()
    for i in range(0, pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        resources = pynvml.nvmlDeviceGetUtilizationRates(handle)
        gpus.append({
            "id": i,
            "memory": resources.memory,
            "proc": resources.gpu
        })

    return render_template('system.admin.html.jinja2',
                           page_name='admin_system',
                           page_title='System',
                           ram=ram,
                           cpu=round(psutil.cpu_percent(), 2),
                           gpus=gpus,
                           disk_usage=disk_usage)
Example #4
 def autoselect(gpu_target: List[int], min_memory: float) -> int:
     logging.info(f'GPU search space: {gpu_target}')
     nvmlInit()
     deviceCount = nvmlDeviceGetCount()
     memories = np.zeros((deviceCount, COUNT), dtype=np.float32)
     rates = np.zeros((deviceCount, COUNT), dtype=np.float32)
     for c in range(COUNT):
         for i in range(deviceCount):
             if i not in gpu_target:
                 memories[i, c] = 0
                 rates[i, c] = 100
             else:
                 handle = nvmlDeviceGetHandleByIndex(i)
                 memories[i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024**3
                 rates[i, c] = int(nvmlDeviceGetUtilizationRates(handle).gpu)
         time.sleep(INTERVAL)
     nvmlShutdown()
     memories = memories.mean(1)
     rates = rates.mean(1)
     # enough memory GPU ids
     memory_enough_ids = np.where(memories > min_memory)[0]
     if len(memory_enough_ids) > 0:
         # min util GPU
         gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])]
         # if several GPUs have the same utilization, choose the one with the most free memory
         gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0]
         gpu_min_ids = memory_enough_ids[gpu_min_ids]
         gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])]
         logging.info(f'Auto select GPU {gpuid}')
     else:
         raise MemoryError(str(memories))
     return int(gpuid)
Example #5
    def cb():
        nonlocal last_time
        now = time.time()
        src_dict = {"time": [now * 1000]}
        gpu_tot = 0
        mem_tot = 0
        tx_tot = 0
        rx_tot = 0
        for i in range(ngpus):
            gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
            mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
            tx = (pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024)
            rx = (pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024)
            gpu_tot += gpu
            mem_tot += mem / (1024 * 1024)
            rx_tot += rx
            tx_tot += tx
            src_dict["gpu-" + str(i)] = [gpu]
            src_dict["memory-" + str(i)] = [mem]
        src_dict["gpu-total"] = [gpu_tot / ngpus]
        src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100]
        src_dict["tx-total"] = [tx_tot]
        src_dict["rx-total"] = [rx_tot]

        source.stream(src_dict, 1000)

        last_time = now
Example #6
    def __query_util(handle):
        """
        Query information on the utilization of a GPU.

        Arguments:
            handle:
                NVML device handle.

        Returns:
            summaries (:obj:`dict`):
                Dictionary containing the memory values for ['mem_util', 'gpu_util'].
                All values are given as integers in the range [0, 100].
        """
        # Query information on the GPU utilization.
        util = nvml.nvmlDeviceGetUtilizationRates(handle)

        summaries = dict()
        # Percent of time over the past second during which global (device) memory was being
        # read or written.
        summaries['mem_util'] = util.memory
        # Percent of time over the past second during which one or more kernels was executing
        # on the GPU.
        summaries['gpu_util'] = util.gpu

        return summaries
Example #7
def _get_gpu_usage(gpu_count):
    import pynvml
    gpus = []
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU)
            try:
                power_usage = (
                    pynvml.nvmlDeviceGetPowerUsage(handle) /
                    1000.0) / (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) /
                               1000.0) * 100
            except pynvml.NVMLError as e:
                logger.error(
                    "Couldn't extract power usage due to NVML exception: {}".
                    format(str(e)))
                power_usage = -9999
            gpus.append(
                (handle, util.gpu, util.memory,
                 (memory.used / float(memory.total)) * 100, temp, power_usage))
        except pynvml.NVMLError as e:
            logger.error(
                "Couldn't extract gpu usage information due to NVML exception: {}"
                .format(str(e)))
            return None
    return gpus
Example #8
def avg_gpu_info(measure_duration, print_info=False):
    """
    Input:
        measure_duration: int
    Output:
        avg_free_memory: numpy.array[int], len=gpu_count
        avg_gpu_util: numpy.array[int], len=gpu_count
    """
    # Get average gpu status
    pynvml.nvmlInit()  # initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpu_count)]
    avg_free_memory = [0.0] * gpu_count
    avg_gpu_util = [0.0] * gpu_count
    for _ in range(int(measure_duration)):
        for id, handle in enumerate(handles):
            avg_free_memory[id] = avg_free_memory[
                id] + pynvml.nvmlDeviceGetMemoryInfo(handle).free / 1e6
            avg_gpu_util[id] = avg_gpu_util[
                id] + pynvml.nvmlDeviceGetUtilizationRates(handle).gpu

        time.sleep(1)
    avg_free_memory = np.array(
        [int(memory / measure_duration) for memory in avg_free_memory])
    avg_gpu_util = np.array(
        [int(power / measure_duration) for power in avg_gpu_util])
    if print_info:
        present_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(present_time)
        for gpu_id in range(gpu_count):
            gpu_info = 'GPU%d: gpu util:%d%% | free memory:%dMiB' % (
                gpu_id, avg_gpu_util[gpu_id], avg_free_memory[gpu_id])
            logging.info(gpu_info)
    return avg_free_memory, avg_gpu_util
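A possible usage sketch for avg_gpu_info; picking the GPU with the most average free memory is an illustrative policy, not part of the original example:

free_mem, gpu_util = avg_gpu_info(measure_duration=5, print_info=True)
best = int(np.argmax(free_mem))  # index of the GPU with the most average free memory (MB)
logging.info('Candidate GPU: %d (util %d%%, free %d MB)'
             % (best, gpu_util[best], free_mem[best]))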
Example #9
    def gpu_info(self):
        # pip install nvidia-ml-py3
        if len(self.gpu_ids) > 0 and torch.cuda.is_available():
            try:
                import pynvml
                pynvml.nvmlInit()
                self.config_dic[
                    'gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion(
                    )
                for gpu_id in self.gpu_ids:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                    gpu_id_name = "gpu%s" % gpu_id
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self.config_dic['%s_device_name' %
                                    gpu_id_name] = pynvml.nvmlDeviceGetName(
                                        handle)
                    self.config_dic['%s_mem_total' %
                                    gpu_id_name] = gpu_mem_total = round(
                                        mem_info.total / 1024**3, 2)
                    self.config_dic['%s_mem_used' %
                                    gpu_id_name] = gpu_mem_used = round(
                                        mem_info.used / 1024**3, 2)
                    # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                    self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                        (gpu_mem_used / gpu_mem_total) * 100, 1)
                    self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name,
                                          gpu_utilize.gpu, 0.8)
                    # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                    # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory

                pynvml.nvmlShutdown()
            except Exception as e:
                print(e)
Example #10
def real_time():
    init_once()
    h = _pynvml_handles()
    return {
        "utilization": pynvml.nvmlDeviceGetUtilizationRates(h).gpu,
        "memory-used": pynvml.nvmlDeviceGetMemoryInfo(h).used,
    }
Example #11
 def load(self):
     if self.__has_gpu:
         l = 0.0
         for i in range(self.__ngpus):
             l += pynvml.nvmlDeviceGetUtilizationRates(self.__handle[i]).gpu
         return (l / self.__ngpus) / 100.0
     return 0.0
Example #12
def autoset_nvgpu(metric="memory", k=1):
    """autoset_nvgpu
    automatically set NVIDIA GPU device

    Args:
        metric (str): memory/utilization
            select the GPU with min(metric)
        k (int): num. of selected devices
    """
    pynvml.nvmlInit()
    gpunum = pynvml.nvmlDeviceGetCount()
    assert (k <= gpunum)
    metric_list = []
    for idx in range(gpunum):
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)

        if metric in ["util", "utilization"]:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            # use the numeric GPU utilization so the list can be sorted below
            metric_list.append((util_rate.gpu, idx))
        else:
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            mem_use_rate = 1.0 - mem_info.free / mem_info.total
            metric_list.append((mem_use_rate, idx))
    # sort the devices with ascending metric
    metric_list = sorted(metric_list, key=lambda x: x[0])
    selected_idx = [str(x[1]) for x in metric_list[:k]]
    # set the visible devices
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(selected_idx)
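Because autoset_nvgpu works by exporting CUDA_VISIBLE_DEVICES, it only takes effect if it runs before CUDA is initialized. A hedged usage sketch under that assumption (the torch calls are illustrative, not part of the original example):

import os
import torch

autoset_nvgpu(metric="memory", k=1)  # choose the least-used GPU first
# torch only sees the selected device as long as CUDA has not been initialized yet
print(os.environ["CUDA_VISIBLE_DEVICES"], torch.cuda.device_count())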
Example #13
def device_status(device_index):
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            users.append(proc.username())
            dates.append(proc.create_time())
            if cmd is None:
                cmd = parse_cmd_roughly(proc.cmdline())
        except psutil.NoSuchProcess:
            users.append('?')
    return {
        'type': device_name,
        'is_available': len(pids) == 0,
        'pids': ','.join([str(pid) for pid in pids]),
        'users': ','.join(users),
        'running_since':
        arrow.get(min(dates)).humanize() if len(dates) > 0 else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
Example #14
def utilization(device: Optional[Union[Device, int]] = None) -> int:
    r"""Returns the percent of time over the past sample period during which one or
    more kernels was executing on the GPU as given by `nvidia-smi`.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    Warning: Each sample period may be between 1 second and 1/6 second,
    depending on the product being queried.
    """
    try:
        import pynvml  # type: ignore[import]
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            "pynvml module not found, please install pynvml")
    from pynvml import NVMLError_DriverNotLoaded
    try:
        pynvml.nvmlInit()
    except NVMLError_DriverNotLoaded:
        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?")
    device = _get_device_index(device, optional=True)
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
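A minimal usage sketch; recent PyTorch releases expose this same helper as torch.cuda.utilization, which is what the call below assumes:

import torch

if torch.cuda.is_available():
    # percent of the last sample period during which a kernel ran on device 0
    print(torch.cuda.utilization(0))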
Example #15
def get_gpu_utilization(gpu_idx):
    try:
        handle = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        util = nv.nvmlDeviceGetUtilizationRates(handle)
    except nv.NVMLError as err:
        util = err
    return util
Example #16
    def _get_gpu_status(self, used_gpu_indexes):
        """ Get the status of the currently used GPUs.

        Args:
            used_gpu_indexes: (list)

        Returns:
            gpu_status: (list)

        """
        gpu_status = list()
        nvmlInit()

        for index in used_gpu_indexes:
            handle = nvmlDeviceGetHandleByIndex(index)
            utilization_rates = nvmlDeviceGetUtilizationRates(handle)
            mem_info = nvmlDeviceGetMemoryInfo(handle)
            mem_usage = mem_info.used / mem_info.total
            status = {
                "index": index,
                "gpu_util": utilization_rates.gpu,
                "mem_usage": mem_usage
            }
            gpu_status.append(status)

        nvmlShutdown()
        return gpu_status
Example #17
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #18
def query_device(index):
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    return {
        'index': index,
        'name': pynvml.nvmlDeviceGetName(handle).decode(),
        'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        'uuid': pynvml.nvmlDeviceGetUUID(handle).decode(),
    }
Example #19
def _get_gpu_usage(gpu_id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)

    gpu_usage = pynvml.nvmlDeviceGetUtilizationRates(handle)

    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    gpu_mem_usage = gpu_mem.used / gpu_mem.total if gpu_mem.total else 0
    return gpu_usage.gpu, gpu_mem_usage
Example #20
def get_gpu_util(handle):
    util = -1
    try:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        util = util.gpu
    except Exception:
        raise
    return util
Example #22
def mem_utilization_for(device_handle):
    """
        Percent of time over the past sample period during which global (device) memory was being read or written.
    """
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).memory
    except pynvml.NVMLError:
        return None
Example #23
def utilization_for(device_handle):
    """Get GPU device consumption in percent
        Percent of time over the past sample period during which one or more kernels was executing on the GPU.
    """
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).gpu
    except pynvml.NVMLError:
        return None
Example #24
def query_gpu(handle: int) -> Dict:
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

    return {
        "gpu_{}_memory_free".format(handle): int(memory.free),
        "gpu_{}_memory_used".format(handle): int(memory.used),
        "gpu_{}_utilization".format(handle): utilization.gpu,
    }
Example #25
def get_gpu_util(gpu_id=0):
    nv.nvmlInit()

    handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
    print('AWS DEBUG nvmlDeviceGetHandleByIndex', handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    print('AWS DEBUG nvmlDeviceGetUtilizationRates.gpu', utilization)

    return utilization
Example #26
 def getFreeRatio(id):
     handle = pynvml.nvmlDeviceGetHandleByIndex(id)
     info = pynvml.nvmlDeviceGetMemoryInfo(handle)
     # print("Memory Total: ",info.total/(1024*1024))
     # print("Memory Free: ",info.free/(1024*1024))
     # print("Memory Used: ",info.used/(1024*1024))
     use = pynvml.nvmlDeviceGetUtilizationRates(handle)
     ratio = 0.5 * (float(use.gpu) + float(use.memory))
     return ratio
Example #27
 def load(self) -> float:
     if self.__has_gpu:
         total_load = 0.0
         for i in range(self.__ngpus):
             with contextlib.suppress(Exception):
                 total_load += pynvml.nvmlDeviceGetUtilizationRates(
                     self.__handle[i]).gpu
         return (total_load / self.__ngpus) / 100.0
     return 0.0
Example #28
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {
                    'gpu': util_gpu,
                    'memory': util_mem
                },
                'memory': {
                    'total': mem_total,
                    'free': mem_free,
                    'used': mem_used
                },
                'temperature': temperature,
                'power': {
                    'draw': power_draw,
                    'limit': power_limit
                }
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #29
    def sample_utilization_rates(self, handle: DeviceHandle) -> DeviceUtilizationRates:
        memory = nvmlDeviceGetMemoryInfo(handle)
        total = memory.total / MiB
        used = memory.used / MiB

        utilization = nvmlDeviceGetUtilizationRates(handle)
        utilization_gpu = utilization.gpu
        utilization_memory = utilization.memory
        self.log_debug(f"Sampled utilization rates: {used:.2f} MiB, {utilization_gpu}%, {utilization_memory}%")
        return DeviceUtilizationRates(total, used, utilization_gpu, utilization_memory)
Example #30
 def __getitem__(self, item: int):
     if item >= len(self):
         raise IndexError
     h = nv.nvmlDeviceGetHandleByIndex(item)
     idx = nv.nvmlDeviceGetIndex(h)
     mem = nv.nvmlDeviceGetMemoryInfo(h)
     uti = nv.nvmlDeviceGetUtilizationRates(h)
     return idx, dict(free=Bytes(mem.free),
                      used=Bytes(mem.used),
                      util=Percent(uti.gpu))
Example #31
 def _log_gpu_utilization(self):
     gpu_utilizations = {}
     # Get current GPU utilizations in percent
     for gpu_name, gpu_hdl in self.gpu_handles.items():
         gpu_percentage = nvmlDeviceGetUtilizationRates(handle=gpu_hdl).gpu
         gpu_utilizations[gpu_name] = gpu_percentage
     # log GPU utilization to TensorBoard
     self._tb_logger.add_scalars(main_tag='GPUs_utilization_percentage',
                                 tag_scalar_dict=gpu_utilizations,
                                 global_step=time() - self._start_time)
Example #32
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {'gpu': util_gpu, 'memory': util_mem},
                'memory': {'total': mem_total, 'free': mem_free,
                           'used': mem_used},
                'temperature': temperature,
                'power': {'draw': power_draw, 'limit': power_limit}
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #33
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #34
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
Example #35
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except:
                msg = "cannot get information of network interface %s" % \
                      self.nif

        if self.doDiskIO:
            try:
                # measure a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except:
                msg = "cannot get information of disk usage "

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%"+"(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor). I continue")

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
Example #36
 def info_refresh(self):
     
     try:
         stat = open("/proc/stat")
         self.statlines = stat.read().splitlines()[1:-1]
         stat.close()
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         for j in self.statlines[i].split()[1:]:  # skip the leading "cpu#" label
             self.total[i] += int(j)
         self.idle[i] = int(self.statlines[i].split()[4])
     
     for i in range(self.corecount):
         if (self.total[i] - self.prev_total[i]) == 0:
             self.prev_idle[i] = self.idle[i]
             self.prev_total[i] = self.total[i]
             break
         
         self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])) )
         self.prev_idle[i] = self.idle[i]
         self.prev_total[i] = self.total[i]
         self.idle[i] = 0
         self.total[i] = 0
     
     for i in range(self.deviceCount):
         
         util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
         temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
         memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
         (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
         (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
         
         mem_total = memInfo.total / 1024 / 1024
         mem_used = memInfo.used / 1024 / 1024
         
         self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
         ########
         self.util_history.append(util.gpu)
         self.util_graph.queue_draw()
         
         self.temp_history.append(temp)
         self.temp_graph.queue_draw()
         ########
         self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory)
         self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100)
         
         self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util)
         self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util)
         self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100)
         self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100)
         
         self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
         self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total)
         
         self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp)
         if temp > 100:
            temp = 100
         elif temp < 0:
             temp = 0
         self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100)
         
         
     #--proc--
     procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
     
     proc_liststore = Gtk.ListStore(int, str, int)
     
     for p in procs:
         pid = p.pid
         try:
             path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
         except:
             self.exit()
          if p.usedGpuMemory is None:
             mem = 0
         else:
             mem = (p.usedGpuMemory / 1024 / 1024)
         proc_liststore.append([pid, path, mem])
     self.tree.set_model(proc_liststore)
     return True
Example #37
def get_proc(device_handle):
    """Get GPU device CPU consumption in percent."""
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).gpu
    except pynvml.NVMLError:
        return None
Example #38
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #39
 def getFreeRatio(id):
     handle = pynvml.nvmlDeviceGetHandleByIndex(id)
     use = pynvml.nvmlDeviceGetUtilizationRates(handle)
     ratio = 0.5 * (float(use.gpu) + float(use.memory))
     return ratio
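All of the examples above follow the same basic flow: initialize NVML, obtain a handle per device, read nvmlDeviceGetUtilizationRates (often alongside nvmlDeviceGetMemoryInfo), and shut NVML down. A condensed sketch of that pattern, not taken from any single example:

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print("GPU %d: util %d%%, mem util %d%%, used %.0f MiB"
              % (i, rates.gpu, rates.memory, mem.used / 1024 ** 2))
finally:
    pynvml.nvmlShutdown()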