Example no. 1
def get():
    handles = []
    output = []
    # nvmlDeviceGetCount() returns an int, so iterate over range(count)
    for device_id in range(nvidia_smi.nvmlDeviceGetCount()):
        handles.append(nvidia_smi.nvmlDeviceGetHandleByIndex(device_id))
    for handle in handles:
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        output.append({'usage': res.gpu, 'memory': res.memory})
    return output
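The get() snippet above assumes NVML is already initialized elsewhere; run standalone, the nvidia_smi calls would fail with an uninitialized error. A minimal usage sketch (the init/shutdown wrapping is an assumption, not part of the original example):

import nvidia_smi

nvidia_smi.nvmlInit()
try:
    # res.gpu / res.memory are utilization percentages (GPU core and memory controller)
    for idx, entry in enumerate(get()):
        print('GPU {}: {}% core, {}% memory controller'.format(
            idx, entry['usage'], entry['memory']))
finally:
    nvidia_smi.nvmlShutdown()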
Example no. 2
def log_gpu_memory_to_tensorboard():
    '''
    Log every GPU's current free memory level to TensorBoard.
    '''
    for i in range(nvidia_smi.nvmlDeviceGetCount()):
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(gpus[i])
        with loggers[i].as_default():
            tl.summary({'free': np.array(info.free) / (1024**3)},
                       step=int(time.time()),
                       name='GPUs')
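log_gpu_memory_to_tensorboard() relies on module-level `gpus` handles and TensorBoard `loggers` created by init_gpu_writers() (Example no. 5 below), plus a `tl.summary` wrapper that this snippet does not define. Assuming that setup exists, a periodic logging loop might look like:

# Sketch only: assumes init_gpu_writers() (Example no. 5) has populated the
# module-level `gpus` and `loggers`, and that NVML has been initialized.
import time

while True:
    log_gpu_memory_to_tensorboard()
    time.sleep(60)   # record free memory once per minute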
Example no. 3
def check_cuda_memory():
    nvidia_smi.nvmlInit()

    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)"\
              .format(i, nvidia_smi.nvmlDeviceGetName(handle), 100*info.free/info.total, \
                      info.total, info.free, info.used))
    nvidia_smi.nvmlShutdown()
    return
Example no. 4
    def __init__(self, exclude_gpu_ids: list = []):
        """
        Usage:

        g = GPUAllocator()
        gpu_id = g.get_gpu()

        ## do something with gpu_id 

        g.set_as_free(gpu_id)

        """

        nvidia_smi.nvmlInit()

        self.num_gpus = nvidia_smi.nvmlDeviceGetCount()
        self.gpu_names =  []

        self.usage = {}

        for i in range(self.num_gpus):
            if i not in exclude_gpu_ids:
                self.gpu_names.append('cuda:' + str(i))
                self.usage[i] = False

        """
        on a good day, this is what the variables look like:

        self.num_gpus= 2

        self.gpu_names= [
            'cuda:0', 
            'cuda:1'
        ]

        self.usage= {
            0: False,
            1: False
        }
        
        """

        print( "[" + Colors.CYAN+ "EDEN" +Colors.END+ "] " + 'Initialized GPUAllocator with devices: ', self.gpu_names)

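Only __init__ is shown here; the get_gpu() and set_as_free() methods referenced in the usage docstring are not part of this example. A hypothetical sketch of the two, based solely on the self.usage dict above, might be:

    # Hypothetical sketch (not from the original source): hand out the first
    # GPU whose usage flag is False, and release a GPU by index.
    def get_gpu(self):
        for gpu_id, busy in self.usage.items():
            if not busy:
                self.usage[gpu_id] = True
                return gpu_id
        return None  # no free GPU right now

    def set_as_free(self, gpu_id):
        if gpu_id in self.usage:
            self.usage[gpu_id] = False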
        """
def init_gpu_writers(logdir):
    '''
    Set up one TensorBoard file writer per GPU.
    '''
    global gpus, loggers
    for i in range(nvidia_smi.nvmlDeviceGetCount()):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        gpus.append(handle)
        name = nvidia_smi.nvmlDeviceGetName(handle).decode().replace(
            ' ', '-') + ':' + str(int(i))
        loggers.append(
            tf.summary.create_file_writer(
                os.path.join(logdir,
                             os.uname().nodename, name)))
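init_gpu_writers() appends to module-level `gpus` and `loggers` lists and uses `tf`, `os`, and NVML names that the snippet does not import. A plausible module preamble (an assumption beyond what the code itself shows):

# Presumed module preamble (assumption, not from the source snippet).
import os
import time

import numpy as np
import tensorflow as tf
import nvidia_smi

gpus = []      # NVML device handles, appended by init_gpu_writers()
loggers = []   # one tf.summary file writer per GPU

nvidia_smi.nvmlInit()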
Example no. 6
def get_SystemStats(process, NVIDIA_GPU):
    if NVIDIA_GPU:
        deviceCount = nvidia_smi.nvmlDeviceGetCount()
        gpu_memory = []
        gpu_utilization = []
        for i in range(0, deviceCount):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            gpu_stat = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
            gpu_memory.append(gpu_stat.memory)
            gpu_utilization.append(gpu_stat.gpu)
    else:
        gpu_memory = []
        gpu_utilization = []

    sys_memory = process.memory_info()[0] / 2. ** 30  # resident set size (RSS) in GiB

    return gpu_memory, gpu_utilization, sys_memory
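get_SystemStats() expects a psutil-style process object and, when NVIDIA_GPU is True, assumes NVML has already been initialized; both are assumptions, since the surrounding setup is not shown. A usage sketch:

# Usage sketch (assumes psutil and a prior nvidia_smi.nvmlInit() call).
import os
import psutil
import nvidia_smi

nvidia_smi.nvmlInit()
proc = psutil.Process(os.getpid())
gpu_memory, gpu_utilization, sys_memory = get_SystemStats(proc, NVIDIA_GPU=True)
print(gpu_utilization, gpu_memory, '{:.2f} GiB RSS'.format(sys_memory))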
Example no. 7
def set_cluster_client(n_gpus=-1, device_spill_frac=0.8):
    # TODO: find a proper fix. If this function is called a second time, the cluster
    # is not recreated correctly; a new cluster can only be created after a kernel restart.
    '''
    device_spill_frac: Spill GPU-Worker memory to host at this limit.
                       Reduce if spilling fails to prevent device memory errors.
    '''
    if os.path.isdir("dask-worker-space"):
        shutil.rmtree('dask-worker-space', ignore_errors=True)
    # Deploy a Single-Machine Multi-GPU Cluster
    if n_gpus == -1:
        nvidia_smi.nvmlInit()
        n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
        print('\n n_gpus_avail: {}'.format(n_gpus_avail))
        n_gpus = n_gpus_avail
    # Select devices to place workers
    visible_devices = ", ".join(str(i) for i in range(n_gpus))
    # print('visible_devices: {}'.format(visible_devices))

    # TODO: how to reinitialize the cluster
    cluster = LocalCUDACluster(
        protocol="tcp",  # "tcp" or "ucx"
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_spill_frac * device_mem_size(kind="total"),
    )
    try:
        # Create the distributed client
        client = Client(cluster)
        display(client)
        print('\n Dashboard avail: http://localhost:8888/proxy/8787/status')

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                initial_pool_size=None,  # Use default size
            )

        client.run(_rmm_pool)
        return client
    except MemoryError:
        print('\n The client is already initialized')
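set_cluster_client() leans on the RAPIDS/Dask stack without showing its imports. Assuming dask_cuda, dask.distributed, rmm, and an IPython display() are what the names refer to, the header would plausibly be:

# Presumed imports for this example (assumption, not shown in the snippet).
import os
import shutil

import nvidia_smi
import rmm                                    # RAPIDS memory manager
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask_cuda.utils import device_mem_size   # total device memory helper
from IPython.display import display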
Example no. 8
def nvapi():
    nvmlInit()
    ret = {}
    n_gpus = int(nvmlDeviceGetCount())
    ret['n_gpus'] = n_gpus
    for i in range(n_gpus):
        gpu_str = '{}.'.format(i)
        gpu_obj = nvmlDeviceGetHandleByIndex(i)
        ret[gpu_str + 'temp'] = nvmlDeviceGetTemperature(
            gpu_obj, NVML_TEMPERATURE_GPU)
        this_ram = nvmlDeviceGetMemoryInfo(gpu_obj)
        ret[gpu_str + 'ram.used'] = this_ram.used / MB
        ret[gpu_str + 'ram.total'] = this_ram.total / MB
    ret[gpu_str + 'power.current'] = nvmlDeviceGetPowerUsage(gpu_obj) / 1000.0
    ret[gpu_str + 'power.limit'] = nvmlDeviceGetEnforcedPowerLimit(gpu_obj) / 1.0
    ret[gpu_str + 'util'] = nvmlDeviceGetUtilizationRates(gpu_obj).gpu / 1.0
    nvmlShutdown()
    return ret
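nvapi() uses bare pynvml names and an MB constant that the snippet never defines. A plausible preamble (the value of MB is an assumption, chosen so the RAM figures come out in MiB):

from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetTemperature,
                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetPowerUsage,
                    nvmlDeviceGetEnforcedPowerLimit,
                    nvmlDeviceGetUtilizationRates, NVML_TEMPERATURE_GPU)

MB = 1024 * 1024  # assumption: report ram.used / ram.total in MiB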
Example no. 9
    def __get_gpu_info(self):
        def parse_unit(val, scale=1000):
            unit_ls = ['B', 'KB', 'MB', 'GB']
            unit_lv = 0
            while val >= scale:
                val /= scale
                unit_lv += 1
                if unit_lv == len(unit_ls) - 1:
                    break
            return '{:.2f} {}'.format(val, unit_ls[unit_lv])

        sum_info = []
        process_ls = []

        nv.nvmlInit()
        gpu_num = nv.nvmlDeviceGetCount()
        # iterate over each GPU
        for gpu_idx in range(gpu_num):
            h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
            dev_name = nv.nvmlDeviceGetName(h).decode()
            raw_total_mem = nv.nvmlDeviceGetMemoryInfo(h).total
            total_mem = parse_unit(raw_total_mem, 1024)
            raw_used_mem = nv.nvmlDeviceGetMemoryInfo(h).used
            used_mem = parse_unit(raw_used_mem, 1024)
            gpu_util = '{:.2f}'.format(nv.nvmlDeviceGetUtilizationRates(h).gpu)
            gpu_mem_util = '{:.2f}'.format(raw_used_mem * 100 / raw_total_mem)

            tmp = {}
            tmp['gpu_idx'] = str(gpu_idx)
            tmp['dev_name'] = dev_name
            tmp['total_mem'] = total_mem
            tmp['used_mem'] = used_mem
            tmp['gpu_util'] = gpu_util
            tmp['gpu_mem_util'] = gpu_mem_util
            sum_info.append(tmp)

            running_process_obj_ls = nv.nvmlDeviceGetComputeRunningProcesses(h)
            for obj in running_process_obj_ls:
                process_pid = obj.pid
                process_type = 'C'
                process_raw_gpu_mem = obj.usedGpuMemory
                process_name = nv.nvmlSystemGetProcessName(
                    process_pid).decode()
                ctan_name = self.get_ctan_name_by_pid(process_pid)

                tmp = {}
                tmp['gpu_idx'] = str(gpu_idx)
                tmp['dev_name'] = dev_name
                tmp['process_pid'] = str(process_pid)
                tmp['process_type'] = process_type
                tmp['process_name'] = process_name
                tmp['process_gpu_mem'] = parse_unit(process_raw_gpu_mem, 1024)
                tmp['ctan_name'] = ctan_name
                process_ls.append(tmp)

            running_process_obj_ls = nv.nvmlDeviceGetGraphicsRunningProcesses(
                h)
            for obj in running_process_obj_ls:
                process_pid = obj.pid
                process_type = 'G'
                process_raw_gpu_mem = obj.usedGpuMemory
                process_name = nv.nvmlSystemGetProcessName(
                    process_pid).decode()
                ctan_name = self.get_ctan_name_by_pid(process_pid)

                tmp = {}
                tmp['gpu_idx'] = str(gpu_idx)
                tmp['dev_name'] = dev_name
                tmp['process_pid'] = str(process_pid)
                tmp['process_type'] = process_type
                tmp['process_name'] = process_name
                tmp['process_gpu_mem'] = parse_unit(process_raw_gpu_mem, 1024)
                tmp['ctan_name'] = ctan_name
                process_ls.append(tmp)
        return sum_info, process_ls
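__get_gpu_info() calls self.get_ctan_name_by_pid(), which belongs to the surrounding class and is not shown. A hypothetical implementation (pure assumption) could match a PID against `docker top` output via docker-py:

    # Hypothetical helper (not from the original source): resolve a PID to the
    # name of the running container that owns it, using docker-py.
    def get_ctan_name_by_pid(self, pid):
        for ctan in self.containers.list():
            rows = ctan.top().get('Processes') or []
            if any(str(pid) == row[1] for row in rows):  # column 1 is the PID
                return ctan.name
        return None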
Example no. 10
    def __get_ctan_verbose_stats(self, name):
        # walk a chain of keys, returning default if any lookup fails
        def graceful_chain_get(d, *args, default=None):
            t = d
            for a in args:
                try:
                    t = t[a]
                except (KeyError, ValueError, TypeError, AttributeError):
                    return default
            return t

        # compute CPU usage percentage
        def calculate_cpu_percent2(d,
                                   previous_cpu_total=None,
                                   previous_cpu_system=None):
            cpu_percent = 0.0
            cpu_total = float(d["cpu_stats"]["cpu_usage"]["total_usage"])
            if previous_cpu_total is None:
                previous_cpu_total = cpu_total
            cpu_delta = cpu_total - previous_cpu_total
            cpu_system = float(d["cpu_stats"]["system_cpu_usage"])
            if previous_cpu_system is None:
                previous_cpu_system = cpu_system
            system_delta = cpu_system - previous_cpu_system
            online_cpus = d["cpu_stats"].get(
                "online_cpus",
                len(d["cpu_stats"]["cpu_usage"]["percpu_usage"]))
            if system_delta > 0.0:
                cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
            return cpu_percent, cpu_total, cpu_system

        # compute block I/O bytes
        def calculate_blkio_bytes(d):
            """
            :param d:
            :return: (read_bytes, wrote_bytes), ints
            """
            bytes_stats = graceful_chain_get(d, "blkio_stats",
                                             "io_service_bytes_recursive")
            if not bytes_stats:
                return 0, 0
            r = 0
            w = 0
            for s in bytes_stats:
                if s["op"] == "Read":
                    r += s["value"]
                elif s["op"] == "Write":
                    w += s["value"]
            return r, w

        # compute network bytes
        def calculate_network_bytes(d):
            """
            :param d:
            :return: (received_bytes, transceived_bytes), ints
            """
            networks = graceful_chain_get(d, "networks")
            if not networks:
                return 0, 0
            r = 0
            t = 0
            for if_name, data in networks.items():
                r += data["rx_bytes"]
                t += data["tx_bytes"]
            return r, t

        def calculate_mem_bytes(d):
            mem_limit = d['memory_stats']['limit']
            mem_usage = d['memory_stats']['usage']
            return mem_usage, mem_limit

        def parse_unit(val, scale=1000):
            unit_ls = ['B', 'KB', 'MB', 'GB']
            unit_lv = 0
            while val >= scale:
                val /= scale
                unit_lv += 1
                if unit_lv == len(unit_ls) - 1:
                    break
            return '{:.2f} {}'.format(val, unit_ls[unit_lv])

        if name not in self.user_stats_stream:
            # print('add {} into user_stats_stream'.format(name))
            ctan = self.containers.get(name)
            self.user_stats_stream[name] = ctan.stats(decode=True)

        # pull the next sample from the stats stream
        if self.containers.get(name).status == 'running':
            raw_stats = self.user_stats_stream[name].__next__()
            pre_cpu_stats = self.pre_cpu_stats[name]
        else:
            return None

        # cpu
        cpu_percent, cpu_total, cpu_system = calculate_cpu_percent2(
            raw_stats, pre_cpu_stats[0], pre_cpu_stats[1])
        self.pre_cpu_stats[name] = [cpu_total, cpu_system]  # update previous cpu usage
        # blk
        read_blk, write_blk = calculate_blkio_bytes(raw_stats)
        # net
        read_net, write_net = calculate_network_bytes(raw_stats)
        # mem
        mem_usage, mem_limit = calculate_mem_bytes(raw_stats)

        # user gpu
        gpu_all_mem, gpu_used_mem, gpu_used_pcnt = 0, 0, 0
        gpu_num = nv.nvmlDeviceGetCount()
        for gpu_idx in range(gpu_num):
            h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
            running_process_obj_ls = nv.nvmlDeviceGetComputeRunningProcesses(h)
            for obj in running_process_obj_ls:
                process_pid = obj.pid
                process_raw_gpu_mem = obj.usedGpuMemory
                ctan_name = self.get_ctan_name_by_pid(process_pid)
                if ctan_name == name:
                    gpu_used_mem += process_raw_gpu_mem

            gpu_all_mem += nv.nvmlDeviceGetMemoryInfo(h).total

        ret_dt = {
            'id': raw_stats['id'],
            'pid': str(raw_stats['pids_stats']['current']),
            'cpu_percent': '{:.2f}'.format(cpu_percent),
            'read_blk': parse_unit(read_blk),
            'write_blk': parse_unit(write_blk),
            'read_net': parse_unit(read_net),
            'write_net': parse_unit(write_net),
            'mem_usage': parse_unit(mem_usage, scale=1024),
            'mem_limit': parse_unit(mem_limit, scale=1024),
            'mem_usage_pcnt': '{:.2f}'.format(mem_usage / mem_limit * 100),
            'gpu_mem_usage': parse_unit(gpu_used_mem, 1024),
            'gpu_mem_limit': parse_unit(gpu_all_mem, 1024),
            'gpu_mem_usage_pcnt':
            '{:.2f}'.format(gpu_used_mem / gpu_all_mem * 100)
        }

        return ret_dt
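__get_ctan_verbose_stats() also assumes instance state (self.containers, self.user_stats_stream, self.pre_cpu_stats) and an initialized NVML binding imported as nv. A hypothetical constructor matching that usage:

# Hypothetical wrapper class (assumption): only shows the state the method reads.
from collections import defaultdict

import docker
import nvidia_smi as nv   # the snippet refers to the NVML binding as `nv`

class ContainerMonitor:
    def __init__(self):
        nv.nvmlInit()
        self.containers = docker.from_env().containers
        self.user_stats_stream = {}                             # name -> streaming stats generator
        self.pre_cpu_stats = defaultdict(lambda: [None, None])  # name -> [cpu_total, cpu_system]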
Example no. 11
def get_gpu_info(gpu_id=None):
    """
    :param gpu_id: gpu bus id
    :return mem_used: used memory in MiB
    :return mem_total: total memory in MiB
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(int(gpu_id))
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mem_used = mem_res.used / (1024**2)
    mem_total = mem_res.total / (1024**2)
    return mem_used, mem_total, gpu_id


def print_gpu_info(gpu_id=None):
    """ Print gpu-info regarding gpu_id on console
    :param gpu_id: gpu bus id
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    mem_used, mem_total, gpu_id = get_gpu_info(gpu_id=int(gpu_id))
    print("GPU({}): {:.2f}MiB / {:.2f}MiB".format(gpu_id, mem_used, mem_total))


if __name__ == '__main__':
    eager_setup()

    x = tf.random.normal(shape=(100, 1000))

    for id in range(nvidia_smi.nvmlDeviceGetCount()):
        mem_used, mem_total, gpu_id = get_gpu_info(gpu_id=id)
        print("GPU({}): {:.2f}MiB / {:.2f}MiB".format(id, mem_used, mem_total))
Example no. 12
def check_gpu():
    try:
        nvidia_smi.nvmlInit()
        return nvidia_smi.nvmlDeviceGetCount() > 0
    except Exception:
        # NVML library missing or no NVIDIA driver available
        return False
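A typical call site just branches on the result, for example:

# Usage sketch: pick a device string based on whether an NVIDIA GPU is visible.
device = 'cuda' if check_gpu() else 'cpu'
print('running on', device)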
Example no. 13
    def __enter__(self):
        if not NvidiaSmi.init:
            nvidia_smi.nvmlInit()
            NvidiaSmi.total_devices = nvidia_smi.nvmlDeviceGetCount()
            NvidiaSmi.init = True
        return self
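Only __enter__ is shown; the rest of the NvidiaSmi context manager is outside this snippet. A hypothetical skeleton consistent with the class attributes it touches (init, total_devices):

# Hypothetical skeleton (not from the original source); the __enter__ above
# would sit inside this class.
class NvidiaSmi:
    init = False        # NVML initialized once per process
    total_devices = 0   # set by __enter__ on first use

    def __exit__(self, exc_type, exc_value, traceback):
        # NVML is deliberately left initialized for reuse; call
        # nvidia_smi.nvmlShutdown() here instead for a per-context shutdown.
        return False

# usage
with NvidiaSmi():
    print(NvidiaSmi.total_devices, 'GPU(s) visible')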