Example no. 1
def select_gpu():
    """
    Finding the gpu number with min used memory.

    Args:
        None

    Returns:
        GPU number with min used memory (or with max free memory). string
    """
    import pynvml
    pynvml.nvmlInit()

    gpu_count = pynvml.nvmlDeviceGetCount()  # number of gpu
    gpu_devices = list(range(gpu_count))  # serial number of gpu devices

    # Select the GPU with the least used memory
    min_used = float('inf')  # avoids assuming a fixed card size (was hard-coded to 24 GB)
    gpu_selected = gpu_devices[0]
    for gpu_id in gpu_devices:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        if meminfo.used < min_used:
            min_used = meminfo.used
            gpu_selected = gpu_id

    return str(gpu_selected)
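
A possible way to consume the returned index (hypothetical usage, not part of the original snippet) is to pin the process to that device before any CUDA work:

import os

# Hypothetical usage: restrict CUDA to the least-used GPU reported by select_gpu().
os.environ["CUDA_VISIBLE_DEVICES"] = select_gpu()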
Example no. 2
def get_statistics():
    """Get statistics for each GPU installed in the system."""
    nvmlInit()
    statistics = []

    try:
        count = nvmlDeviceGetCount()
        for i in range(count):
            handle = nvmlDeviceGetHandleByIndex(i)

            memory = nvmlDeviceGetMemoryInfo(handle)

            statistics.append({
                "gpu": i,
                "name": nvmlDeviceGetName(handle).decode("utf-8"),
                "memory": {
                    "total": _convert_kb_to_gb(int(memory.total)),
                    "used": _convert_kb_to_gb(int(memory.used)),
                    "utilisation": int(memory.used / memory.total * 100)
                },
            })
    except NVMLError as error:
        print(error)

    return statistics
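
The helper _convert_kb_to_gb is not part of this snippet; a minimal sketch, assuming the values passed in are raw byte counts from NVML despite the helper's name:

def _convert_kb_to_gb(size):
    # Assumption: NVML reports bytes, so convert to gigabytes and round for display.
    return round(size / 1024 ** 3, 2)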
Example no. 3
def delay4gpus(delay, gpu_list):
    if isinstance(delay, bool):
        if delay:
            import pynvml
            import time
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_list[0])
            while True:
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                usage = memory.used / memory.total
                if usage < 0.2:
                    break
                else:
                    print('GPU-%d is in use %.2f, still waiting' %
                          (gpu_list[0], usage))
                time.sleep(60)
    elif isinstance(delay, (int, float)):
        import time
        from tqdm import tqdm  # tqdm is used below but was never imported
        delay = int(delay)
        for minute in tqdm(range(delay),
                           desc='Wait:',
                           leave=False,
                           smoothing=0.1):
            time.sleep(60)
    else:
        raise NotImplementedError('Wrong delay type')
Example no. 4
 def _get_gpu(self, update: Update, context: CallbackContext):
     print(update.message.from_user.username, "requested gpu usage")
     pynvml.nvmlInit()
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
     info = pynvml.nvmlDeviceGetMemoryInfo(handle)
     update.message.reply_text(get_usage_msg(info),
                               parse_mode=telegram.ParseMode.MARKDOWN)
Example no. 5
def real_time():
    return {
        "utilization":
        [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
        "memory-used":
        [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles],
    }
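
Here, handles is assumed to be a module-level list prepared elsewhere; a minimal sketch of that setup (an assumption, not shown in the original snippet):

import pynvml

pynvml.nvmlInit()
# Assumed module-level handle list, one entry per visible device.
handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
           for i in range(pynvml.nvmlDeviceGetCount())]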
Example no. 6
    def gpu_info(self):
        # pip install nvidia-ml-py3
        if len(self.gpu_ids) > 0 and torch.cuda.is_available():
            try:
                import pynvml
                pynvml.nvmlInit()
                self.config_dic[
                    'gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion(
                    )
                for gpu_id in self.gpu_ids:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                    gpu_id_name = "gpu%s" % gpu_id
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self.config_dic['%s_device_name' %
                                    gpu_id_name] = pynvml.nvmlDeviceGetName(
                                        handle)
                    self.config_dic['%s_mem_total' %
                                    gpu_id_name] = gpu_mem_total = round(
                                        mem_info.total / 1024**3, 2)
                    self.config_dic['%s_mem_used' %
                                    gpu_id_name] = gpu_mem_used = round(
                                        mem_info.used / 1024**3, 2)
                    # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                    self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                        (gpu_mem_used / gpu_mem_total) * 100, 1)
                    self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name,
                                          gpu_utilize.gpu, 0.8)
                    # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                    # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory

                pynvml.nvmlShutdown()
            except Exception as e:
                print(e)
Example no. 7
 def print_ram_info(self):
     gpu_total = gpu_free = cpu_free = gc_free = 0
     self._ph()
     try:
         gc_free = gc.collect()
         torch.cuda.empty_cache()  # @UndefinedVariable
         val = psutil.virtual_memory()._asdict()
         cpu_free = round((val["available"] / (1024**3)), 2)
         self._pp("Free CPU RAM", str(cpu_free) + " GB")
         #
         pynvml.nvmlInit()
         handle = pynvml.nvmlDeviceGetHandleByIndex(0)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         gpu_free = round(info.free / (1024**3), 2)
         self._pp("Free GPU RAM", str(gpu_free) + " GB")
         #
         gpu_total = round(info.total / (1024**3), 2)
         self._pp("Total GPU RAM", str(gpu_free) + " GB")
         self._pp("Garbage Collection", gc_free)
     except:
         self._pp("**Error", "NO GPU accelerator")
         self._pp(
             "Suggest recovery",
             "Menu > Runtime > Change Runtime Type > {select} GPU accelerator"
         )
     self._ph()
     return
Example no. 8
 def autoselect(gpu_target: List[int], min_memory: float) -> int:
     logging.info(f'GPU search space: {gpu_target}')
     nvmlInit()
     deviceCount = nvmlDeviceGetCount()
     memories = np.zeros((deviceCount, COUNT), dtype=np.float32)
     rates = np.zeros((deviceCount, COUNT), dtype=np.float32)
     for c in range(COUNT):
         for i in range(deviceCount):
             if i not in gpu_target:
                 memories[i, c] = 0
                 rates[i, c] = 100
             else:
                 handle = nvmlDeviceGetHandleByIndex(i)
                 memories[
                     i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024**3
                 rates[i,
                       c] = int(nvmlDeviceGetUtilizationRates(handle).gpu)
         time.sleep(INTERVAL)
     nvmlShutdown()
     memories = memories.mean(1)
     rates = rates.mean(1)
     # enough memory GPU ids
     memory_enough_ids = np.where(memories > min_memory)[0]
     if len(memory_enough_ids) > 0:
         # min util GPU
         gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])]
         # if multi GPUs' util are the same, choose one that has the most memory
         gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0]
         gpu_min_ids = memory_enough_ids[gpu_min_ids]
         gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])]
         logging.info(f'Auto select GPU {gpuid}')
     else:
         raise MemoryError(str(memories))
     return int(gpuid)
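
COUNT and INTERVAL are module-level constants not shown here; a plausible configuration (assumed values, not from the original source) samples each GPU a few times with a short pause:

COUNT = 5      # number of samples to average per GPU (assumed)
INTERVAL = 2   # seconds to wait between samples (assumed)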
Example no. 9
def prepare_net(net, use_gpu=True):

    handle = None
    device = 'cpu'
    if not use_gpu:
        print('Running on CPUs')
        return net, device, handle

    if torch.cuda.is_available():
        device = 'cuda'

    if device != 'cpu':
        import pynvml
        import torch.backends.cudnn as cudnn

        print('Running on GPU')
        net = net.to(device)
        # net = torch.nn.DataParallel(net)
        cudnn.benchmark = True
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        device_name = pynvml.nvmlDeviceGetName(handle).decode("utf-8")
        print("CUDA Device: {} | RAM: {:.4g}G".format(
            device_name, mem_info.total / (2**30)))
    else:
        print('No CUDA devices available, run on CPUs')

    return net, device, handle
Example no. 10
def gpu_mem_used_get():
    "query nvidia for used memory for gpu in MBs (rounded down). If id is not passed, currently selected torch device is used. Clears pytorch cache before taking the measurements"
    torch.cuda.empty_cache()  # clear cache to report the correct data
    id = torch.cuda.current_device()
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return int(info.used / 2**20)
Example no. 11
def get_mem(device_handle):
    """Get GPU device memory consumption in percent."""
    try:
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
        return memory_info.used * 100.0 / memory_info.total
    except pynvml.NVMLError:
        return None
Example no. 12
def get_gpu_memory(gpu_idx):
    try:
        handle = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        mem = nv.nvmlDeviceGetMemoryInfo(handle)
    except nv.NVMLError as err:
        mem = err
    return mem
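
nv is an import alias established elsewhere in the enclosing module; the assumed import is simply:

import pynvml as nv  # assumed alias used by this snippet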
Example no. 13
def real_time():
    init_once()
    h = _pynvml_handles()
    return {
        "utilization": pynvml.nvmlDeviceGetUtilizationRates(h).gpu,
        "memory-used": pynvml.nvmlDeviceGetMemoryInfo(h).used,
    }
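
init_once() and _pynvml_handles() are helpers defined elsewhere in the same module; a minimal sketch, assuming one-time NVML setup and a single-device handle:

import pynvml

_initialized = False

def init_once():
    # Initialize NVML exactly once per process (assumed behaviour of the helper).
    global _initialized
    if not _initialized:
        pynvml.nvmlInit()
        _initialized = True

def _pynvml_handles():
    # Assumption: return the handle of the first visible device.
    return pynvml.nvmlDeviceGetHandleByIndex(0)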
Example no. 14
 def get(index):
     try:
         handle = pynvml.nvmlDeviceGetHandleByIndex(index)
     except pynvml.NVMLError_GpuIsLost:
         return None
     memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
     return dict(
         nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
         nvmlDeviceGetMemoryInfo=dict(
             total=memory_info.total,
             free=memory_info.free,
             used=memory_info.used,
         ),
         nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
         nvmlDeviceGetFanSpeed=get_fan_speed(handle),
         nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
             handle, pynvml.NVML_TEMPERATURE_GPU),
         nvmlDeviceGetTemperatureThreshold=dict(
             slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                 handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
             shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                 handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
         ),
         nvmlDeviceGetPowerManagementLimit=pynvml.nvmlDeviceGetPowerManagementLimit(handle),
         nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
     )
Example no. 15
def getGPUstate():
    """
    pip install nvidia-ml-py3
    :return:返回一个数组,数组长度为GPU的个数
    """
    meminfo = {}
    infoStr = ""
    try:
        pynvml.nvmlInit()
        devicecount = pynvml.nvmlDeviceGetCount()
        for num in range(devicecount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(num)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            meminfo[
                num] = "Device: {} , {} / {} {:.2f}%, free memory:{}".format(
                    num, info.used, info.total, info.used / info.total * 100,
                    info.free)
        for i in range(len(meminfo)):
            infoStr += meminfo[i] + "\n"
        # mainlog(infoStr,'info')
        return infoStr
    except Exception as e:
        #mainlog(e, 'error')
        # print("error happen in getGPUstate:"+str(e))
        return "出现错误 Error:" + str(e)
Example no. 16
def get_device_total_memory(index=0):
    """
    Return total memory of CUDA device with index
    """
    pynvml.nvmlInit()
    return pynvml.nvmlDeviceGetMemoryInfo(
        pynvml.nvmlDeviceGetHandleByIndex(index)).total
Example no. 17
def avg_gpu_info(measure_duration, print_info=False):
    """
    Input:
        measure_duration: int
    Output:
        avg_free_memory: numpy.array[int], len=gpu_count
        avg_gpu_util: numpy.array[int], len=gpu_count
    """
    # Get average gpu status
    pynvml.nvmlInit()  # initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpu_count)]
    avg_free_memory = [0.0] * gpu_count
    avg_gpu_util = [0.0] * gpu_count
    for _ in range(int(measure_duration)):
        for id, handle in enumerate(handles):
            avg_free_memory[id] = avg_free_memory[
                id] + pynvml.nvmlDeviceGetMemoryInfo(handle).free / 1e6
            avg_gpu_util[id] = avg_gpu_util[
                id] + pynvml.nvmlDeviceGetUtilizationRates(handle).gpu

        time.sleep(1)
    avg_free_memory = np.array(
        [int(memory / measure_duration) for memory in avg_free_memory])
    avg_gpu_util = np.array(
        [int(util / measure_duration) for util in avg_gpu_util])
    if print_info:
        present_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(present_time)
        for gpu_id in range(gpu_count):
            gpu_info = 'GPU%d: gpu util:%d%% | free memory:%dMiB' % (
                gpu_id, avg_gpu_util[gpu_id], avg_free_memory[gpu_id])
            logging.info(gpu_info)
    return avg_free_memory, avg_gpu_util
Example no. 18
def watch_gpu(k):
    import os  # needed for os.environ below
    import pynvml
    import time
    import torch
    pynvml.nvmlInit()
    gpu_num = [0, 1, 2, 3, 4, 5, 6, 7]
    while True:
        for i in gpu_num:
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            free = meminfo.free / 1024**2

            if free >= 2000:
                print("第%s号卡存在剩余空间: " % i, free)
                os.environ['CUDA_VISIBLE_DEVICES'] = str(i)
                # a = torch.rand([1,3,500,500])
                from models.common import GPUModel
                model = GPUModel(120 * k)
                model.cuda()
                print_here = True
                while True:
                    if print_here:
                        print("Done")
                        print_here = False
                    time.sleep(60)  # hold the GPU without busy-waiting

            time.sleep(1)
Example no. 19
def gpus_available() -> dict:
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices is None:
            visible_devices = list(range(nvmlDeviceGetCount()))
        else:
            visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total    : {info.total}')
            # print(f'free     : {info.free}')
            # print(f'used     : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return {}
Example no. 20
def get_available_device(args=[], init=True):
    """Convenience function that gets available GPU units and returns a string
	on the pattern f"/GPU:{i}" telling the index of the one currently using
	the lowest memory. Also sets the environment variable 'CUDA_VISIBLE_DEVICES'
	to f'{i}'.
	
	If there is an NVMLError in the attempt to get this information,
	`i` defaults to a pre-selected unit.
	
	If `args` has len > 1, the second argument is the integer index of the GPU.
	"""
    #os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
    if init: nv.nvmlInit()
    if len(args) <= 1:
        try:
            devices = map(nv.nvmlDeviceGetHandleByIndex,
                          range(nv.nvmlDeviceGetCount()))
            #ind,device
            devices_enum = sorted(
                enumerate(devices),
                key=lambda d: nv.nvmlDeviceGetMemoryInfo(d[1]).free,
                reverse=True)
            ind = devices_enum[0][0]
        except nv.NVMLError as e:
            print(e)
            print(">  >  > error occurred: defaulting to gpu3")
            ind = 3
    else:
        ind = args[1]

    print('*\t*\t*\t*\t*\t*\t*\tget_available_device(): using device', ind)

    os.environ['CUDA_VISIBLE_DEVICES'] = f'{ind}'

    return '/GPU:%i' % ind
Example no. 21
def auto_select_gpu():
    """Select gpu which has largest free memory"""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)

        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info(
            'Using largest free memory GPU {} with free memory {}MB'.format(
                gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info(
            'nvidia-ml-py is not installed, automatic GPU selection is disabled!'
        )
        return '0'
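
HAS_NVML is set elsewhere in the module; a common pattern (assumed here, not taken from the original source) probes for the binding at import time:

try:
    import pynvml
    HAS_NVML = True  # assumed flag: the NVML binding is importable
except ImportError:
    HAS_NVML = False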
Example no. 22
    def _get_vram(self):
        """ Obtain the total VRAM in Megabytes for each connected GPU.

        Returns
        -------
        list
             List of floats containing the total amount of VRAM in Megabytes for each connected GPU
             corresponding to the values in :attr:`_handles`
        """
        self._initialize()
        if self._device_count == 0:
            vram = list()
        elif self._is_plaidml:
            vram = self._plaid.vram
        elif IS_MACOS:
            vram = [
                pynvx.cudaGetMemTotal(handle, ignore=True) / (1024 * 1024)
                for handle in self._handles
            ]
        else:
            vram = [
                pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024)
                for handle in self._handles
            ]
        self._log("debug", "GPU VRAM: {}".format(vram))
        return vram
Example no. 23
def auto_select_gpu():
  """Select gpu which has largest free memory"""
  if HAS_NVML:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    largest_free_mem = 0
    largest_free_idx = 0
    for i in range(deviceCount):
      handle = pynvml.nvmlDeviceGetHandleByIndex(i)
      info = pynvml.nvmlDeviceGetMemoryInfo(handle)
      if info.free > largest_free_mem:
        largest_free_mem = info.free
        largest_free_idx = i
    pynvml.nvmlShutdown()
    largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

    idx_to_gpu_id = {}
    for i in range(deviceCount):
      idx_to_gpu_id[i] = '{}'.format(i)

    gpu_id = idx_to_gpu_id[largest_free_idx]
    logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
    return gpu_id
  else:
    logging.info('nvidia-ml-py is not installed, automatic GPU selection is disabled!')
    return '0'
Example no. 24
def one_time():
    init_once()
    h = _pynvml_handles()
    return {
        "memory-total": pynvml.nvmlDeviceGetMemoryInfo(h).total,
        "name": pynvml.nvmlDeviceGetName(h).decode(),
    }
Example no. 25
    def setVisibleGpu(self):
        '''
        Set the visible GPU ids
        '''
        num_gpu = self.opt.BASE.NUM_GPUS
        gpu_list = [str(i) for i in self.opt.BASE.GPU_ID]
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu_list[:num_gpu])
        '''Check the current GPU memory usage'''
        import pynvml
        pynvml.nvmlInit()
        # the index here is the GPU id
        handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_list[0]))
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        total = meminfo.total  # total memory of the selected GPU
        used = meminfo.used  # values are in bytes; divide by 1024**2 to get MB

        ratio = used / total
        if ratio > 0.5:
            flag = True
            while flag:
                ans = input(
                    "More than 50% resource has been occupied on GPU{0}, are you sure to continue?(y/n)"
                    .format(str(gpu_list[0])))
                if ans == 'n':
                    exit(0)
                elif ans == 'y':
                    flag = False
Example no. 27
def log_gpu_memory():
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    free_mb = round(info.free / 1024**2)
    used_mb = round(info.used / 1024**2)
    print('GPU memory free: {}, memory used: {}'.format(free_mb, used_mb))
    return used_mb
Example no. 28
def seeGmemorys(gpu_ids, tag=None):
    global count
    global old_Mb
    Mb = []

    pynvml.nvmlInit()
    for gpu_id in gpu_ids:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        m = meminfo.used / 1024 / 1024
        Mb.append(m)  ## B --> MB
    if (np.array(Mb) < 10000).any():
        SendMail(_subject='{} stop'.format(tag), _content='GPU free')
        return True
    # elif (np.array(Mb) > np.array(old_Mb)).any():
    # 	SendMail(_subject = '{} increase'.format(tag), _content = 'GPU increase')
    # 	return False
    else:
        count += 1
        old_Mb = Mb
        mem_str = ', '.join([
            'id = {}, memory = {} Mb'.format(gpu_id, m)
            for gpu_id, m in zip(gpu_ids, Mb)
        ])
        print('spy {} times. {}'.format(count, mem_str))
        return False
Example no. 29
    def _select_device(gpu):
        import os

        from numpy import argmax

        logger = getLogger("clinicadl")

        if not gpu:
            return "cpu"
        else:
            # TODO: Add option gpu_device (user chooses the gpu)
            # How to perform multi-GPU ?
            try:
                # In this case, the GPUs visible to CUDA are restricted and we let CUDA choose
                _ = os.environ["CUDA_VISIBLE_DEVICES"]
                return "cuda"
            except KeyError:
                # Else we choose ourselves the GPU with the greatest amount of memory
                from pynvml import (
                    nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo,
                    nvmlInit,
                )

                nvmlInit()
                memory_list = [
                    nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i)).free
                    for i in range(torch.cuda.device_count())
                ]
                free_gpu = argmax(memory_list)
                return f"cuda:{free_gpu}"
Example no. 30
    def track(self):
        """
        Track the GPU memory usage
        """
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(self.device)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        self.curr_line = self.frame.f_lineno
        where_str = self.module_name + ' ' + self.func_name + ':' + ' line ' + str(self.curr_line)

        with open(self.gpu_profile_fn, 'a+') as f:

            if self.begin:
                f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
                        f" Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")
                self.begin = False

            if self.print_detail is True:
                ts_list = [tensor.size() for tensor in self.get_tensors()]
                new_tensor_sizes = {(type(x), tuple(x.size()), ts_list.count(x.size()), np.prod(np.array(x.size()))*4/1000**2)
                                    for x in self.get_tensors()}
                for t, s, n, m in new_tensor_sizes - self.last_tensor_sizes:
                    f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20}\n')
                for t, s, n, m in self.last_tensor_sizes - new_tensor_sizes:
                    f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} \n')
                self.last_tensor_sizes = new_tensor_sizes

            f.write(f"\nAt {where_str:<50}"
                    f"Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")

        pynvml.nvmlShutdown()
Example no. 31
 def get_free(self):
     """ Return the vram available """
     self.initialize()
     vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024)
             for handle in self.handles]
     self.shutdown()
     return vram
Example no. 32
    def _get_free_vram(self):
        """ Obtain the amount of VRAM that is available, in Megabytes, for each connected GPU.

        Returns
        -------
        list
             List of floats containing the amount of VRAM available, in Megabytes, for each
             connected GPU as corresponding to the values in :attr:`_handles

        Notes
        -----
        There is no useful way to get free VRAM on PlaidML. OpenCL loads and unloads VRAM as
        required, so this returns the total memory available per card for AMD cards, which is
        not particularly useful.

        """
        self._initialize()
        if self._is_plaidml:
            vram = self._plaid.vram
        elif IS_MACOS:
            vram = [
                pynvx.cudaGetMemFree(handle, ignore=True) / (1024 * 1024)
                for handle in self._handles
            ]
        else:
            vram = [
                pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024)
                for handle in self._handles
            ]
        self._shutdown()
        self._log("debug", "GPU VRAM free: {}".format(vram))
        return vram
Example no. 33
    def cb():
        nonlocal last_time
        now = time.time()
        src_dict = {"time": [now * 1000]}
        gpu_tot = 0
        mem_tot = 0
        tx_tot = 0
        rx_tot = 0
        for i in range(ngpus):
            gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
            mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
            gpu_tot += gpu
            mem_tot += mem / (1024 * 1024)
            if pci_gen is not None:
                tx = (pynvml.nvmlDeviceGetPcieThroughput(
                    gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024)
                rx = (pynvml.nvmlDeviceGetPcieThroughput(
                    gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024)
                rx_tot += rx
                tx_tot += tx
            src_dict["gpu-" + str(i)] = [gpu]
            src_dict["memory-" + str(i)] = [mem]
        src_dict["gpu-total"] = [gpu_tot / ngpus]
        src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100]
        src_dict["tx-total"] = [tx_tot]
        src_dict["rx-total"] = [rx_tot]

        source.stream(src_dict, 1000)

        last_time = now
Example no. 34
def get_gpu_mem_used():
    try:
        from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        totalMemory = nvmlDeviceGetMemoryInfo(handle)
        return totalMemory.used
    except Exception:
        return -1
Example no. 35
    def get_used(self):
        """ Return the vram in use """
        self.initialize()
        vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).used / (1024 * 1024)
                for handle in self.handles]
        self.shutdown()

        if self.verbose:
            print("GPU VRAM used:    {}".format(vram))

        return vram
Example no. 36
 def get_free(self):
     """ Return the vram available """
     self.initialize()
     if IS_MACOS:
         vram = [pynvx.cudaGetMemFree(handle, ignore=True) / (1024 * 1024)
                 for handle in self.handles]
     else:
         vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024)
                 for handle in self.handles]
     self.shutdown()
     if self.logger:
         self.logger.debug("GPU VRAM free: %s", vram)
     return vram
Example no. 37
def get_memory_information(handle):
    mem_total = -1
    mem_used = -1
    mem_percent = -1
    try:
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024
        mem_percent = (float(memInfo.used) / memInfo.total) * 100.
    except Exception:
        pass

    return mem_used, mem_total, mem_percent
Example no. 38
    def get_used(self):
        """ Return the vram in use """
        self.initialize()
        if IS_MACOS:
            vram = [pynvx.cudaGetMemUsed(handle, ignore=True) / (1024 * 1024)
                    for handle in self.handles]
        else:
            vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).used / (1024 * 1024)
                    for handle in self.handles]
        self.shutdown()

        if self.logger:
            self.logger.verbose("GPU VRAM used: %s", vram)
        return vram
Example no. 39
 def get_vram(self):
     """ Return total vram in megabytes per device """
     self.initialize()
     if self.device_count == 0:
         vram = list()
     elif IS_MACOS:
         vram = [pynvx.cudaGetMemTotal(handle, ignore=True) / (1024 * 1024)
                 for handle in self.handles]
     else:
         vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).total /
                 (1024 * 1024)
                 for handle in self.handles]
     if self.logger:
         self.logger.debug("GPU VRAM: %s", vram)
     return vram
Example no. 40
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {'gpu': util_gpu, 'memory': util_mem},
                'memory': {'total': mem_total, 'free': mem_free,
                           'used': mem_used},
                'temperature': temperature,
                'power': {'draw': power_draw, 'limit': power_limit}
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example no. 41
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem:        12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info   = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfill the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
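
nice_ratio is a module-level constant not shown in this snippet; a plausible value (an assumption, not from the original code) caps a "polite" request at a fraction of total memory:

nice_ratio = 0.5  # assumed: never request more than half of the card's total memory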
Example no. 42
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example no. 43
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						pass
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			index = 2
			self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
			self.wfile.write(bytes(host, 'utf-8'))
			self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				if not parameters.get('host' + str(index)).strip():
					pass
				elif not parameters.get('start' + str(index)).strip():
					pass
				elif not parameters.get('end' + str(index)).strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Example no. 44
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
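
N, MB and _decode come from the enclosing module of this gpustat-style collector; a minimal sketch of the assumed definitions:

import pynvml as N  # assumed alias for the NVML binding

MB = 1024 * 1024    # bytes per MiB, used to convert memory figures

def _decode(value):
    # Assumption: NVML may return bytes or str depending on the binding version.
    return value.decode("utf-8") if isinstance(value, bytes) else value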
Example no. 45
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure over a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except:
                msg = "cannot get information of network interface %s" % \
                      self.nif

        if self.doDiskIO:
            try:
                # measure over a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except:
                msg = "cannot get information of disk usage "

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%"+"(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor). I continue")

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
Example no. 46
 def info_refresh(self):
     
     try:
         stat = open("/proc/stat")
         self.statlines = stat.read().splitlines()[1:-1]
         stat.close()
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         for j in self.statlines[i].split()[1:]:  # skip the 'cpu#' label
             self.total[i] += int(j)
         self.idle[i] = int(self.statlines[i].split()[4])
     
     for i in range(self.corecount):
         if (self.total[i] - self.prev_total[i]) == 0:
             self.prev_idle[i] = self.idle[i]
             self.prev_total[i] = self.total[i]
             break
         
         self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])) )
         self.prev_idle[i] = self.idle[i]
         self.prev_total[i] = self.total[i]
         self.idle[i] = 0
         self.total[i] = 0
     
     for i in range(self.deviceCount):
         
         util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
         temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
         memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
         (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
         (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
         
         mem_total = memInfo.total / 1024 / 1024
         mem_used = memInfo.used / 1024 / 1024
         
         self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
         ########
         self.util_history.append(util.gpu)
         self.util_graph.queue_draw()
         
         self.temp_history.append(temp)
         self.temp_graph.queue_draw()
         ########
         self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory)
         self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100)
         
         self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util)
         self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util)
         self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100)
         self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100)
         
         self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
         self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total)
         
         self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp)
         if temp > 100:
            temp = 100
         elif temp < 0:
             temp = 0
         self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100)
         
         
     #--proc--
     procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
     
     proc_liststore = Gtk.ListStore(int, str, int)
     
     for p in procs:
         pid = p.pid
         try:
             path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
         except:
             self.exit()
         if p.usedGpuMemory is None:
             mem = 0
         else:
             mem = (p.usedGpuMemory / 1024 / 1024)
         proc_liststore.append([pid, path, mem])
     self.tree.set_model(proc_liststore)
     return True
Example no. 47
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example no. 48
 def get_vram(self):
     """ Return total vram in megabytes per device """
     vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024)
             for handle in self.handles]
     return vram
Example no. 49
def printGPUINFO():
    gpu_id = config.GPU_ID
    gpu_obj = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    print ("gup mem used:", pynvml.nvmlDeviceGetMemoryInfo(gpu_obj).used/1024/1024, "MB")