Example #1
    def _get_gpu_status(self, used_gpu_indexes):
        """ Get the status of the currently used GPUs.

        Args:
            used_gpu_indexes: (list)

        Returns:
            gpu_status: (list)

        """
        gpu_status = list()
        nvmlInit()

        for index in used_gpu_indexes:
            handle = nvmlDeviceGetHandleByIndex(index)
            utilization_rates = nvmlDeviceGetUtilizationRates(handle)
            mem_info = nvmlDeviceGetMemoryInfo(handle)
            mem_usage = mem_info.used / mem_info.total
            status = {
                "index": index,
                "gpu_util": utilization_rates.gpu,
                "mem_usage": mem_usage
            }
            gpu_status.append(status)

        nvmlShutdown()
        return gpu_status
Example #2
def gpu_info() -> dict:
    info = dict()

    try:
        nvmlInit()
    except NVMLError:
        info['no-gpu'] = 'No Nvidia GPU detected'
        return info

    device_count = nvmlDeviceGetCount()

    info['driver_version'] = nvmlSystemGetDriverVersion().decode()
    info['device_count'] = device_count
    info['device'] = dict()
    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        memory = nvmlDeviceGetMemoryInfo(handle)

        info['device'][i] = dict()
        info['device'][i]['name'] = str(nvmlDeviceGetName(handle))

        info['device'][i]['memory'] = dict()

        info['device'][i]['memory']['total'] = str(size_in_gb(memory.total))

    nvmlShutdown()

    return info
Example #3
    def gpu_info(self):
        # pip install nvidia-ml-py3
        if len(self.gpu_ids) > 0 and torch.cuda.is_available():
            try:
                import pynvml
                pynvml.nvmlInit()
                self.config_dic['gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion()
                for gpu_id in self.gpu_ids:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                    gpu_id_name = "gpu%s" % gpu_id
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self.config_dic['%s_device_name' % gpu_id_name] = pynvml.nvmlDeviceGetName(handle)
                    self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total = round(mem_info.total / 1024**3, 2)
                    self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used = round(mem_info.used / 1024**3, 2)
                    # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                    self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                        (gpu_mem_used / gpu_mem_total) * 100, 1)
                    self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name,
                                          gpu_utilize.gpu, 0.8)
                    # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                    # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory

                pynvml.nvmlShutdown()
            except Exception as e:
                print(e)
Example #4
def auto_select_gpu():
  """Select the GPU with the largest free memory."""
  if HAS_NVML:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    largest_free_mem = 0
    largest_free_idx = 0
    for i in range(deviceCount):
      handle = pynvml.nvmlDeviceGetHandleByIndex(i)
      info = pynvml.nvmlDeviceGetMemoryInfo(handle)
      if info.free > largest_free_mem:
        largest_free_mem = info.free
        largest_free_idx = i
    pynvml.nvmlShutdown()
    largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

    idx_to_gpu_id = {}
    for i in range(deviceCount):
      idx_to_gpu_id[i] = '{}'.format(i)

    gpu_id = idx_to_gpu_id[largest_free_idx]
    logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
    return gpu_id
  else:
    logging.info('nvidia-ml-py is not installed, automatic GPU selection is disabled!')
    return '0'
Example #5
def main():
    parser = argparse.ArgumentParser(
        description="collect GPU device memory usage")
    parser.add_argument("-g",
                        type=int,
                        default=1,
                        help="number of gpu devices")
    parser.add_argument("-n", type=float, default=1, help="metrics rate")
    args = parser.parse_args()

    pynvml.nvmlInit()
    n_gpus = args.g
    devices = [
        Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)
    ]

    running = True
    while running:
        time.sleep(args.n)
        running = False
        for device in devices:
            running |= device.update()

    pynvml.nvmlShutdown()
    for i, device in enumerate(devices):
        max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024
        print(f"gpt{i} max memory usage: {max_mem_usage_mbytes:.2f}M")
Example #6
 def shutdown(self):
     """ Shutdown pynvml """
     if self.initialized:
         self.handles = None
         if not IS_MACOS:
             pynvml.nvmlShutdown()
         self.initialized = False
Example #7
 def autoselect(gpu_target: List[int], min_memory: float) -> int:
     logging.info(f'GPU search space: {gpu_target}')
     nvmlInit()
     deviceCount = nvmlDeviceGetCount()
     memories = np.zeros((deviceCount, COUNT), dtype=np.float32)
     rates = np.zeros((deviceCount, COUNT), dtype=np.float32)
     for c in range(COUNT):
         for i in range(deviceCount):
             if i not in gpu_target:
                 memories[i, c] = 0
                 rates[i, c] = 100
             else:
                 handle = nvmlDeviceGetHandleByIndex(i)
                 memories[i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024**3
                 rates[i, c] = int(nvmlDeviceGetUtilizationRates(handle).gpu)
         time.sleep(INTERVAL)
     nvmlShutdown()
     memories = memories.mean(1)
     rates = rates.mean(1)
     # enough memory GPU ids
     memory_enough_ids = np.where(memories > min_memory)[0]
     if len(memory_enough_ids) > 0:
         # min util GPU
         gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])]
         # if multi GPUs' util are the same, choose one that has the most memory
         gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0]
         gpu_min_ids = memory_enough_ids[gpu_min_ids]
         gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])]
         logging.info(f'Auto select GPU {gpuid}')
     else:
         raise MemoryError(str(memories))
     return int(gpuid)
Example #8
def gpus_available() -> Dict[int, float]:
    if not torch.cuda.is_available():
        return dict()
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices is None:
            visible_devices = list(range(nvmlDeviceGetCount()))
        else:
            visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total    : {info.total}')
            # print(f'free     : {info.free}')
            # print(f'used     : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return dict((i, 1.0) for i in range(torch.cuda.device_count()))
Example #9
def gpus_available() -> dict:
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices:
            visible_devices = {
                int(x.strip())
                for x in visible_devices.split(',')
            }
        else:
            visible_devices = list(range(nvmlDeviceGetCount()))
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total    : {info.total}')
            # print(f'free     : {info.free}')
            # print(f'used     : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return gpus
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return {}
Example #10
 def shutdown(self):
     """ Shutdown pynvml """
     if self.initialized:
         self.handles = list()
         if not IS_MACOS and not self.plaid:
             pynvml.nvmlShutdown()
         self.initialized = False
Example #11
def auto_select_gpu():
    """Select the GPU with the largest free memory."""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)

        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info(
            'Using largest free memory GPU {} with free memory {}MB'.format(
                gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info(
            'nvidia-ml-py is not installed, automatic GPU selection is disabled!'
        )
        return '0'
Example #12
def get_num_gpus():
    import pynvml

    pynvml.nvmlInit()
    ngpus = pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()
    return ngpus
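Like most snippets on this page, Example #12 does not guard the shutdown call, so NVML stays initialized if nvmlDeviceGetCount() raises. A minimal try/finally variant (a sketch, not taken from any of the quoted projects; the name get_num_gpus_safe is made up):

import pynvml

def get_num_gpus_safe():
    """Return the number of NVIDIA GPUs, shutting NVML down even on error."""
    pynvml.nvmlInit()
    try:
        return pynvml.nvmlDeviceGetCount()
    finally:
        # Runs whether or not nvmlDeviceGetCount() raised an NVMLError.
        pynvml.nvmlShutdown()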
Example #13
 def shutdown(self):
     """ Shutdown pynvml """
     if self.initialized:
         self.handles = None
         if not IS_MACOS:
             pynvml.nvmlShutdown()
         self.initialized = False
Example #14
    def track(self):
        """
        Track the GPU memory usage
        """
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(self.device)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        self.curr_line = self.frame.f_lineno
        where_str = self.module_name + ' ' + self.func_name + ':' + ' line ' + str(self.curr_line)

        with open(self.gpu_profile_fn.replace(':', ''), 'a+') as f:

            if self.begin:
                f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
                        f" Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")
                self.begin = False

            if self.print_detail is True:
                ts_list = [tensor.size() for tensor in self.get_tensors()]
                new_tensor_sizes = {(type(x), tuple(x.size()), ts_list.count(x.size()), np.prod(np.array(x.size()))*4/1000**2)
                                    for x in self.get_tensors()}
                for t, s, n, m in new_tensor_sizes - self.last_tensor_sizes:
                    f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20}\n')
                for t, s, n, m in self.last_tensor_sizes - new_tensor_sizes:
                    f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} \n')
                self.last_tensor_sizes = new_tensor_sizes

            f.write(f"\nAt {where_str:<50}"
                    f"Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")

        pynvml.nvmlShutdown()
Example #15
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #16
 def wrap(*arg, **kwargs):
     try:
         result = func(*arg, **kwargs)
         nvmlShutdown()
     except Exception:
         pass
     else:
         return result
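Example #16 quotes only the inner closure of a decorator: func is a free variable supplied by an enclosing function that the snippet omits. A hedged sketch of what that outer function might look like (the name shutdown_nvml_after is hypothetical; the body of wrap is copied from the example above):

import functools
from pynvml import nvmlShutdown

def shutdown_nvml_after(func):
    """Hypothetical decorator: run func, then shut NVML down on success."""
    @functools.wraps(func)
    def wrap(*arg, **kwargs):
        try:
            result = func(*arg, **kwargs)
            nvmlShutdown()
        except Exception:
            # Errors from func or from nvmlShutdown are silently swallowed,
            # matching the behavior of the quoted closure.
            pass
        else:
            return result
    return wrap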
Example #17
def get_gpu_count():
    """
    Return the number of GPUs.
    """
    pynvml.nvmlInit()
    gpu_number = pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()
    return gpu_number
Example #18
 def _shutdown(self):
     """ Shutdown pynvml if it was the library used for obtaining stats and set
     :attr:`_initialized` back to ``False``. """
     if self._initialized:
         self._handles = list()
         if not IS_MACOS and not self._is_plaidml:
             pynvml.nvmlShutdown()
         self._initialized = False
Example #19
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))

    nvmlShutdown()
    return gpus
Example #20
def memory_info():
    """
    Assumes identical GPUs in a node
    """
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle).total
    pynvml.nvmlShutdown()
    return gpu_mem
Example #21
@contextmanager  # from contextlib; presumably applied in the upstream source, required for the docstring's claim to hold
def _nvml():
    """Enter a context manager that will init and shutdown nvml."""
    # Copyright (c) 2018 Bohumír Zámečník, Rossum Ltd., MIT license
    # from https://github.com/rossumai/nvgpu/blob/a66dda5ae816a6a8936645fe0520cb4dc6354137/nvgpu/nvml.py#L5
    # Modifications copyright 2019, Nathan Hunt, MIT license

    nv.nvmlInit()
    yield
    nv.nvmlShutdown()
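With the contextlib.contextmanager decorator applied as above, _nvml() can wrap a block of NVML queries. The calls below are only an illustration and assume pynvml is imported as nv, as in the snippet:

# Note: as written, _nvml() has no try/finally, so an exception raised inside
# this block skips nvmlShutdown().
with _nvml():
    count = nv.nvmlDeviceGetCount()
    names = [nv.nvmlDeviceGetName(nv.nvmlDeviceGetHandleByIndex(i)) for i in range(count)]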
Example #22
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))

    nvmlShutdown()
    return gpus
Example #23
def run_logging_loop(async_task, async_loop):
    asyncio.set_event_loop(async_loop)
    pynvml.nvmlInit()
    logger = _logger()
    logger.info("Driver Version: {}".format(
        nativestr(pynvml.nvmlSystemGetDriverVersion())))
    async_loop.run_until_complete(async_task)
    logger.info("Shutting down driver")
    pynvml.nvmlShutdown()
Example #24
def log_gpu_stat(logger):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    used_mem = (meminfo.used / 1024) /1024
    pynvml.nvmlShutdown()
    gpu_info = subprocess.check_output(["nvidia-smi"])
    logger.info(("\nThe pid of current job is {} and {}, the used memory before we run is {}MB,"+\
        " the <nvidia-smi> shows:\n{}").format(os.getpid(),os.getppid(), used_mem, gpu_info.decode("utf-8")))
Example #25
    def end(self, session):
        """Called at the end of a session.

        Arguments:
            session (tf.Session):
                The `session` argument can be used in case the hook wants to run final ops,
                such as saving a last checkpoint.
        """
        # Shutdown the NVML interface.
        nvml.nvmlShutdown()
Example #26
    def exit(self):
        """Overwrite the exit method to close the GPU API."""
        if self.nvml_ready:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.debug("pynvml failed to shutdown correctly ({})".format(e))

        # Call the father exit method
        super(Plugin, self).exit()
Example #27
 def __customCurveSpeed(self):
     nvmlInit()
     self._handle = nvmlDeviceGetHandleByIndex(self.id)
     curve = Curve()
     while (not self.stopped()):
         current_temp = self.__getTemp()
         new_fan_speed = curve.evaluate(current_temp)
         self.__setSpeed(new_fan_speed)
         time.sleep(1.0)
     nvmlShutdown()
Example #28
    def exit(self):
        """Overwrite the exit method to close the GPU API."""
        if self.nvml_ready:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.debug("pynvml failed to shutdown correctly ({})".format(e))

        # Call the father exit method
        super(Plugin, self).exit()
Example #29
def run_hardware_monitor(sv: SharedValues):
    print(time.strftime(LOG_TIME), "Hardware monitoring starts")
    try:
        if GPU_MODE:
            pynvml.nvmlInit()
        hw_info.SSEUpdater.broadcast_sys_info(sv)
    except KeyboardInterrupt:
        print(time.strftime(LOG_TIME), "Hardware monitoring stops")
        if GPU_MODE:
            pynvml.nvmlShutdown()
Example #30
def get_gpu_status(gpu_index=0):
    # initialize NVML before querying the device
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except:
        power_limit = None

    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }
    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
Example #31
    def func0(memory_require=128 * 1024 * 1024,
              tf_gpu_mem_growth=False,
              logger=None,
              console=True):
        try:
            gpu = None
            pynvml.nvmlInit()
            gpu_num = pynvml.nvmlDeviceGetCount()
            # check nvidia driver
            import tensorflow as tf
            gpus = tf.config.experimental.list_physical_devices('GPU')
            del tf
            if gpu_num <= 0 or len(gpus) <= 0:
                pynvml.nvmlShutdown()
                if len(gpus) <= 0 and gpu_num > 0:
                    msg = "have {} GPU, but tensorflow can not detect, check driver or tensorflow if GPU version".format(
                        gpu_num)
                else:
                    msg = "NO GPU"

                if logger:
                    logger.i(msg)
                if console:
                    print(msg)
                return gpu

            for i in range(gpu_num):
                h = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(h)
                info = pynvml.nvmlDeviceGetMemoryInfo(h)
                msg = "GPU:{}, used:{}/{}MB, free:{}MB".format(
                    name.decode(), info.used / 1024 / 1024,
                    info.total / 1024 / 1024, info.free / 1024 / 1024)
                if logger:
                    logger.i(msg)
                if console:
                    print(msg)
                if info.free >= memory_require:
                    gpu = GPU_info(id=i,
                                   name=name.decode(),
                                   mem_free=info.free,
                                   mem_total=info.total)
                    os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
                    import tensorflow as tf
                    tf.config.experimental.set_memory_growth(gpus[i], True)
                    del tf
                    break
            pynvml.nvmlShutdown()
        except Exception as e:
            msg = "select gpu fail:{}".format(e)
            if logger:
                logger.i(msg)
            if console:
                print(msg)
        return gpu
Example #32
def check_nvidia_device():
    try:
        pynvml.nvmlInit()
        driver_version = float(pynvml.nvmlSystemGetDriverVersion())
        pynvml.nvmlShutdown()
        if driver_version < 367.48:
            raise OSError(
                'NVIDIA driver v.{} is not supported. The driver version must be 367.48 or newer'
                .format(driver_version))
    except pynvml.NVMLError:
        raise OSError('NVIDIA device not found')
Example #33
    def get_gpu_machine(self) -> GPUMachine:

        # from pynvml.smi import nvidia_smi
        # nvsmi = nvidia_smi.getInstance()
        # gpu_info = nvsmi.DeviceQuery('index, utilization.gpu, memory.free, count')
        # gpu_machine = GPUMachine(gpu_info["count"])
        #
        # for one_gpu in gpu_info["gpu"]:
        #     gpu_machine.add_gpu_state(
        #         GPUState(free=one_gpu["fb_memory_usage"]["free"],
        #                  util=one_gpu["utilization"]["gpu_util"],
        #                  index=int(one_gpu["minor_number"])
        #                  )
        #     )

        import pynvml

        MB = 1024 * 1024

        pynvml.nvmlInit()

        device_count = pynvml.nvmlDeviceGetCount()
        gpu_machine = GPUMachine(device_count)

        pynvml.nvmlShutdown()

        for index in range(device_count):
            pynvml.nvmlInit()

            handle = pynvml.nvmlDeviceGetHandleByIndex(index)

            index = pynvml.nvmlDeviceGetIndex(handle)

            try:
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            except pynvml.NVMLError:
                utilization = None  # Not supported

            try:
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except pynvml.NVMLError:
                memory = None  # Not supported

            pynvml.nvmlShutdown()

            gpu_machine.add_gpu_state(
                GPUState(free=memory.free // MB,
                         util=utilization.gpu,
                         index=index
                         )
            )

        return gpu_machine
Example #34
def clean_up():
    global _nvml_inited
    if _nvml_inited:
        try:
            pynvml.nvmlShutdown()
            logger.info('[NVML] NVML Shutdown')
        except pynvml.NVMLError as e:
            logger.error('[NVML] NVML Failed to Shutdown: %s' % str(e))
            pass
    _nvml_inited = False
    _static_info['public'] = {}
    _static_info['private'] = {}
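Example #34 tears down state that some initialization routine must have set earlier; that routine is not part of the snippet. A hypothetical counterpart, reusing the module-level names seen above (_nvml_inited, logger, pynvml), might look like:

def init_nvml():
    """Hypothetical init counterpart to clean_up(): set the flag only on success."""
    global _nvml_inited
    if _nvml_inited:
        return
    try:
        pynvml.nvmlInit()
        _nvml_inited = True
        logger.info('[NVML] NVML Initialized')
    except pynvml.NVMLError as e:
        logger.error('[NVML] NVML Failed to Initialize: %s' % str(e))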
Example #35
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
Example #36
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem:        12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info   = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfill the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
Example #37
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #38
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}
                def meminfo(memory):
                    return {
                            "total"  : int(memory.total),
                            "free"   : int(memory.free),
                            "used"   : int(memory.used),
                            }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                       ("name",                     "nvmlDeviceGetName",                    (),     str),
                       ("serial",                   "nvmlDeviceGetSerial",                  (),     str),
                       ("uuid",                     "nvmlDeviceGetUUID",                    (),     str),
                       ("pci",                      "nvmlDeviceGetPciInfo",                 (),     pciinfo),
                       ("memory",                   "nvmlDeviceGetMemoryInfo",              (),     meminfo),
                       ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration",   (),     int),
                       ("pcie-link-width-max",      "nvmlDeviceGetMaxPcieLinkWidth",        (),     int),
                       ("pcie-link-generation",     "nvmlDeviceGetCurrPcieLinkGeneration",  (),     int),
                       ("pcie-link-width",          "nvmlDeviceGetCurrPcieLinkWidth",       (),     int),
                       ("clock-info-graphics",      "nvmlDeviceGetClockInfo",               (0,),   int),
                       ("clock-info-sm",            "nvmlDeviceGetClockInfo",               (1,),   int),
                       ("clock-info-mem",           "nvmlDeviceGetClockInfo",               (2,),   int),
                       ("clock-info-graphics-max",  "nvmlDeviceGetMaxClockInfo",            (0,),   int),
                       ("clock-info-sm-max",        "nvmlDeviceGetMaxClockInfo",            (1,),   int),
                       ("clock-info-mem-max",       "nvmlDeviceGetMaxClockInfo",            (2,),   int),
                       ("fan-speed",                "nvmlDeviceGetFanSpeed",                (),     int),
                       ("temperature",              "nvmlDeviceGetTemperature",             (0,),   int),
                       ("power-state",              "nvmlDeviceGetPowerState",              (),     int),
                       ("vbios-version",            "nvmlDeviceGetVbiosVersion",            (),     str),
                       ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
Example #39
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						pass
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			index = 2
			self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
			self.wfile.write(bytes(host, 'utf-8'))
			self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				if not parameters.get('host' + str(index)).strip():
					pass
				elif not parameters.get('start' + str(index)).strip():
					pass
				elif not parameters.get('end' + str(index)).strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Example #40
 def init(self):
     
     self.util_history = []
     self.temp_history = []
     pynvml.nvmlInit()
     self.gpu_handles = []
     self.deviceCount = pynvml.nvmlDeviceGetCount()
     
     for i in range(self.deviceCount):
         self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
     
     self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
     self.cpu_prog_bars = []
     self.gpu_boxes = []
     self.gpu_prog_bars = []
     
     self.prev_idle = []
     self.prev_total = []
     self.idle = []
     self.total = []
     
     #---cpu_box---
     try:
         stat = open("/proc/stat")
         
         statlines = stat.read().splitlines()
         stat.close()
         
         self.corecount = -1
         
         for line in statlines:
             if (line[0:2] == "cp"):
                 self.corecount+= 1
             else:
                 break
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
         self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
         
         self.prev_idle.append(0)
         self.prev_total.append(0)
         self.idle.append(0)
         self.total.append(0)
     
     #---gpu_boxes---
     for i in range(self.deviceCount):
         product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
         product_name = product_name.decode('utf-8')
         
         gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
         
         label = Gtk.Label(product_name)
         
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
         
         gpu_box.pack_start(label, True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0)
         
         self.gpu_boxes.append(gpu_box)
     
     #---proc---
     proc_liststore = Gtk.ListStore(int, str, int)
     
     self.tree = Gtk.TreeView(model=proc_liststore)
     
     renderer_pid = Gtk.CellRendererText()
     column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0)
     column_pid.set_resizable(True)
     self.tree.append_column(column_pid)
     
     renderer_path = Gtk.CellRendererText()
     column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
     column_path.set_resizable(True)
     column_path.set_fixed_width(250)
     self.tree.append_column(column_path)
     
     renderer_mem = Gtk.CellRendererText()
     column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
     column_mem.set_resizable(True)
     self.tree.append_column(column_mem)
Example #41
def count_gpus():
    nvmlInit()
    count = nvmlDeviceGetCount()
    nvmlShutdown()
    return count
Example #42
 def info_refresh(self):
     
     try:
         stat = open("/proc/stat")
         self.statlines = stat.read().splitlines()[1:-1]
         stat.close()
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         for j in self.statlines[i].split()[1:]: #remove cpu#
             self.total[i] += int(j)
         self.idle[i] = int(self.statlines[i].split()[4])
     
     for i in range(self.corecount):
         if (self.total[i] - self.prev_total[i]) == 0:
             self.prev_idle[i] = self.idle[i]
             self.prev_total[i] = self.total[i]
             break
         
         self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])) )
         self.prev_idle[i] = self.idle[i]
         self.prev_total[i] = self.total[i]
         self.idle[i] = 0
         self.total[i] = 0
     
     for i in range(self.deviceCount):
         
         util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
         temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
         memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
         (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
         (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
         
         mem_total = memInfo.total / 1024 / 1024
         mem_used = memInfo.used / 1024 / 1024
         
         self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
         ########
         self.util_history.append(util.gpu)
         self.util_graph.queue_draw()
         
         self.temp_history.append(temp)
         self.temp_graph.queue_draw()
         ########
         self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory)
         self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100)
         
         self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util)
         self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util)
         self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100)
         self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100)
         
         self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
         self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total)
         
         self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp)
         if temp > 100:
            temp = 100
         elif temp < 0:
             temp = 0
         self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100)
         
         
     #--proc--
     procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
     
     proc_liststore = Gtk.ListStore(int, str, int)
     
     for p in procs:
         pid = p.pid
         try:
             path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
         except:
             self.exit()
         if (p.usedGpuMemory == None):
             mem = 0
         else:
             mem = (p.usedGpuMemory / 1024 / 1024)
         proc_liststore.append([pid, path, mem])
     self.tree.set_model(proc_liststore)
     return True
Example #43
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Example #44
 def _shutdown_nvml(self):
     try:
         pynvml.nvmlShutdown()
     except pynvml.NVMLError as err:
         logger.debug('Failed to shutdown NVML: %s', err)
Example #45
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #46
 def exit(self, widget, ev):
     pynvml.nvmlShutdown()
     Gtk.main_quit()
     quit()