Example #1
def get_free_gpus():
    """ For an N gpu system, returns a list of N boolean values. The nth value
    will be True if no process was running on the nth gpu."""
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) == 0:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
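A minimal usage sketch for the helper above (assuming get_free_gpus and its module-level py3nvml/logging/warnings imports are in scope): pick the first idle GPU and expose only that one to the process.

import os

free = get_free_gpus()                              # one boolean per GPU, True = no compute processes
idle = [i for i, is_free in enumerate(free) if is_free]
if idle:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(idle[0])   # expose only the first idle GPU
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''              # nothing free (or nvml missing): CPU only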
Example #2
def test_nvidia():
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())

    nvml.nvmlInit()
    inspect(version=nvml.nvmlSystemGetDriverVersion())
    inspect(count=nvml.nvmlDeviceGetCount())

    for i in range(nvml.nvmlDeviceGetCount()):
        test_nvidia_device(i)

    nvml.nvmlShutdown()
Example #3
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """

    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()

    except Exception:
        print(
            'WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.'
        )
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(
            nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total

        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()

    return rdict
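A short usage sketch (assuming getCUDAEnvironment and its nvml import are in scope): pick the first device whose memory is mostly free, or fall back to the CPU.

env = getCUDAEnvironment()
if env['first_available_device_index'] is not None:
    device = 'cuda:{}'.format(env['first_available_device_index'])
else:
    device = 'cpu'   # no card, or every card already has more than 10% of its memory in use
print('Selected device:', device, 'out of', env['device_count'], 'GPU(s)')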
Example #4
    def get_gpu_info_by_nvml(self) -> Optional[Dict]:
        """Get GPU info using nvml"""
        gpu_info_list = []
        driver_version = None
        try:
            nvmlInit()
            driver_version = nvmlSystemGetDriverVersion()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_info = {}
                gpu_info["memory_total"] = info.total
                gpu_info["memory_available"] = info.free
                gpu_info["name"] = nvmlDeviceGetName(handle)
                gpu_info_list.append(gpu_info)
            nvmlShutdown()
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None

        result = {"driver_version": driver_version, "devices": gpu_info_list}

        if 'CUDA_VISIBLE_DEVICES' in environ:
            result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
        return result
Example #5
 def measure_gpu_usage(self):
     from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
                          nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError
     max_gpu_usage = []
     gpu_name = []
     try:
         nvmlInit()
         deviceCount = nvmlDeviceGetCount()
         max_gpu_usage = [0 for i in range(deviceCount)]
         gpu_name = [
             nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
             for i in range(deviceCount)
         ]
         while True:
             for i in range(deviceCount):
                 info = nvmlDeviceGetMemoryInfo(
                     nvmlDeviceGetHandleByIndex(i))
                 max_gpu_usage[i] = max(max_gpu_usage[i],
                                        info.used / 1024**2)
             sleep(0.005)  # 5ms
             if not self.keep_measuring:
                 break
         nvmlShutdown()
         return [{
             "device_id": i,
             "name": gpu_name[i],
             "max_used_MB": max_gpu_usage[i]
         } for i in range(deviceCount)]
     except NVMLError as error:
         if not self.silent:
             self.logger.error(
                 "Error fetching GPU information using nvml: %s", error)
         return None
Example #6
    def get_device_count(self):
        """Get the compute capability of device.

        Return:
            count (int): count of device.
        """
        return nvml.nvmlDeviceGetCount()
Example #7
 def read_top_card_memory_in_bytes():
     # pylint: disable=no-member
     # pylint incorrectly detects that function nvmlDeviceGetMemoryInfo returns str
     return self.__nvml_get_or_else(lambda: [
         nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index))
         .total for card_index in range(nvmlDeviceGetCount())
     ],
                                    default=0)
Example #8
def get_device_handles():
    """Get a list of NVML device handles, one per device.

    Can throw NVMLError.
    """
    return [
        pynvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(pynvml.nvmlDeviceGetCount())
    ]
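A usage sketch for get_device_handles; the function itself neither initializes nor shuts down NVML, so the caller has to. This assumes pynvml here is the py3nvml.py3nvml module (as the snippet implies) and that get_device_handles is in scope.

from py3nvml import py3nvml as pynvml

pynvml.nvmlInit()
try:
    for handle in get_device_handles():
        name = pynvml.nvmlDeviceGetName(handle)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print('{}: {} MiB used of {} MiB'.format(name, mem.used >> 20, mem.total >> 20))
finally:
    pynvml.nvmlShutdown()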
Example #9
 def gpus(self):
     out = []
     with nvml_manager():
         if not pynvml:
             return out
         cpu_to_node = self.cpu_nodes()
         n_devices = pynvml.nvmlDeviceGetCount()
         for i in range(n_devices):
             handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             out.append(GPU(handle, cpu_to_node))
     return out
Example #10
    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        super(self.__class__, self).__init__()
        global nvml

        self.output = output

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix
Example #11
def gpu_info():
    "Returns a tuple of (GPU ID, GPU Description, GPU % Utilization)"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
Example #12
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        if util:
            gpu_util = util.gpu
        else:
            gpu_util = 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()

    return driver_version, gpus
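A usage sketch for get_gpu_info (assuming the function is importable and GpuInfo is a namedtuple-style record, as its keyword constructor suggests); it degrades gracefully when NVML is unavailable.

driver, gpus = get_gpu_info()
if gpus is None:
    print('NVML not available, no GPU statistics')
else:
    print('Driver', driver)
    for gpu in gpus:
        print('GPU {} {}: {}/{} MiB, {}% util, {}C'.format(
            gpu.id, gpu.name, gpu.mem_usage, gpu.mem_capacity, gpu.utilization, gpu.temp))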
Example #13
        def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
            from py3nvml.py3nvml import (
                NVMLError,
                nvmlDeviceGetCount,
                nvmlDeviceGetHandleByIndex,
                nvmlDeviceGetMemoryInfo,
                nvmlDeviceGetName,
                nvmlInit,
                nvmlShutdown,
            )

            max_gpu_usage = []
            gpu_name = []
            try:
                nvmlInit()
                device_count = nvmlDeviceGetCount()
                if not isinstance(device_count, int):
                    logger.error(
                        f"nvmlDeviceGetCount result is not integer: {device_count}"
                    )
                    return None

                max_gpu_usage = [0 for i in range(device_count)]
                gpu_name = [
                    nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
                    for i in range(device_count)
                ]
                while True:
                    for i in range(device_count):
                        info = nvmlDeviceGetMemoryInfo(
                            nvmlDeviceGetHandleByIndex(i))
                        if isinstance(info, str):
                            logger.error(
                                f"nvmlDeviceGetMemoryInfo returns str: {info}")
                            return None
                        max_gpu_usage[i] = max(max_gpu_usage[i],
                                               info.used / 1024**2)
                    sleep(0.005)  # 5ms
                    if not self.keep_measuring:
                        break
                nvmlShutdown()
                return [{
                    "device_id": i,
                    "name": gpu_name[i],
                    "max_used_MB": max_gpu_usage[i],
                } for i in range(device_count)]
            except NVMLError as error:
                logger.error("Error fetching GPU information using nvml: %s",
                             error)
                return None
Example #14
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available(): return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
Example #15
    def __init__(self):
        self.labels = ['gpu', 'name', 'driver']
        self.driver = nv.nvmlSystemGetDriverVersion()

        self.n_gpu = nv.nvmlDeviceGetCount()
        self.hnds = [
            nv.nvmlDeviceGetHandleByIndex(i) for i in range(self.n_gpu)
        ]
        self.args = []
        for i, hnd in enumerate(self.hnds):
            args = OrderedDict()
            args['gpu'] = 'gpu%d' % i
            args['name'] = nv.nvmlDeviceGetName(hnd)
            args['driver'] = self.driver
            self.args.append(args)
Example #16
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be True if the
        nth gpu had at most max_procs processes running on it. Set max_procs to 0
        to look for gpus with no processes on them.

    Note
    ----
    If the function can't query the driver, it will return an empty list rather
    than raise an exception.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
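A usage sketch for the max_procs variant (assuming the function above is in scope): treat a GPU with at most one running process as still usable.

free = get_free_gpus(max_procs=1)       # tolerate one existing process per GPU
candidates = [i for i, ok in enumerate(free) if ok]
print('GPUs with at most one process:', candidates or 'none')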
Example #17
 def _get_current_power(self, arrange_next=True):
     # returns the summed draw in watts (nvmlDeviceGetPowerUsage reports milliwatts)
     if self.gpu:
         num_gpus = nvmlDeviceGetCount()
         current_power = 0
         for i in range(num_gpus):
             h = nvmlDeviceGetHandleByIndex(i)
             power = try_get_info(nvmlDeviceGetPowerUsage, h, -1)  # numeric default keeps the division below safe
             current_power += power / 1000
         if arrange_next:
             self.schedule.enter(self.interval, 1, self._get_current_power)
         else:
             pass
     else:
         current_power = 0
     self.powers.append(current_power)
     return current_power
Example #18
def gpustats():
    import py3nvml.py3nvml as pynvml

    if '__gpuhandler__' not in globals():
        globals()['__gpuhandler__'] = True
        pynvml.nvmlInit()

    usage = []
    util = []
    deviceCount = pynvml.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        usage.append(info.used / info.total)
        info = pynvml.nvmlDeviceGetUtilizationRates(handle)
        util.append(info.gpu / 100.)

    return {'maxmemusage': max(usage), 'maxutil': max(util)}
Example #19
def __get_gpu_temps():
    if utils.which('nvidia-smi') is not None:
        try:
            nvml.nvmlInit()
        except nvml.NVMLError as e:
            pass
        else:
            device_count = nvml.nvmlDeviceGetCount()
            print('\nGPU:')
            if device_count > 0:
                for i in range(device_count):
                    handle = nvml.nvmlDeviceGetHandleByIndex(i)
                    gpu_temp = nvml.nvmlDeviceGetTemperature(handle, 0)
                    print(' GPU %(i)s: ${alignr}${color %(color)s}%(temp)s${color}°C' % {
                        'i': i,
                        'color': get_gpu_temps_color(gpu_temp),
                        'temp': gpu_temp
                    })
            nvml.nvmlShutdown()
Example #20
def get_num_procs():
    """ Gets the number of processes running on each gpu

    Returns
    -------
    num_procs : list(int)
        Number of processes running on each gpu

    Note
    ----
    If the function can't query the driver, it will return an empty list rather
    than raise an exception.

    Note
    ----
    If the function can't get the info from a gpu, it will return -1 in that gpu's place.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_procs = [-1] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        gpu_procs[i] = len(procs)

    py3nvml.nvmlShutdown()
    return gpu_procs
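A usage sketch (assuming get_num_procs above is in scope): pick the least-loaded GPU, skipping cards that could not be queried (-1).

procs = get_num_procs()
queryable = {i: n for i, n in enumerate(procs) if n >= 0}
if queryable:
    best = min(queryable, key=queryable.get)
    print('Least busy GPU:', best, 'with', queryable[best], 'process(es)')
else:
    print('No GPU information available')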
Example #21
def gpu_status():
    try:
        py3nvml.nvmlInit()
        device_count = py3nvml.nvmlDeviceGetCount()

        devices = []
        for i in range(device_count):
            gpu = {}
            handle = py3nvml.nvmlDeviceGetHandleByIndex(i)

            memory = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetMemoryInfo, handle))
            if memory:
                memory = round(memory.total * 1.0 / 2**30, 2)

            gpu['name'] = _nmvl_call(partial(py3nvml.nvmlDeviceGetName,
                                             handle))
            gpu['clock'] = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                        py3nvml.NVML_CLOCK_GRAPHICS))
            gpu['clock_mem'] = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                        py3nvml.NVML_CLOCK_MEM))
            gpu['clock_max'] = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                        py3nvml.NVML_CLOCK_GRAPHICS))
            gpu['clock_mem_max'] = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                        py3nvml.NVML_CLOCK_MEM))
            gpu['memory'] = memory

            devices.append(gpu)
        nvidia = {
            'driver_version': py3nvml.nvmlSystemGetDriverVersion(),
            'devices': devices
        }

        return nvidia
    except Exception as e:
        return None
Example #22
    def get_device_procs(self, device_id: int) -> Optional[List[ProcInfo]]:
        """
        List processes running on the GPU.

        Parameters
        ----------
        device_id : int
            Device identifier

        Returns
        -------
        Optional[List[ProcInfo]]
            List of ProcInfo named tuples (name, pid, mem fields)

        Raises
        ------
        RuntimeError
            In case of py3nvml failure.
        """
        py3nvml.nvmlInit()
        dev_count = py3nvml.nvmlDeviceGetCount()  # type: int
        if not (0 <= device_id < dev_count):
            raise RuntimeError('Failed to query GPU with nvml')
        handle = py3nvml.nvmlDeviceGetHandleByIndex(device_id)
        result = []
        try:
            for proc in py3nvml.nvmlDeviceGetComputeRunningProcesses(handle):
                try:
                    name = str(py3nvml.nvmlSystemGetProcessName(proc.pid))
                except py3nvml.NVMLError as err:
                    if (err.value == py3nvml.NVML_ERROR_NOT_FOUND):
                        # exited?
                        continue
                    raise
                mem = proc.usedGpuMemory / 1024 / 1024
                result.append(ProcInfo(name, proc.pid, mem))
        finally:
            py3nvml.nvmlShutdown()

        return result
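A usage sketch for get_device_procs; `monitor` is a hypothetical instance of the class this method belongs to, and ProcInfo exposes the name/pid/mem fields described in the docstring.

try:
    for proc in monitor.get_device_procs(0) or []:
        print('{} (pid {}) uses {:.0f} MiB on GPU 0'.format(proc.name, proc.pid, proc.mem))
except RuntimeError as exc:
    print('Could not query GPU 0:', exc)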
Example #23
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        print("Error fetching GPU information using nvml: %s", error)
        return None
Example #24
    def _get_gpu_info():
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        all_info = []
        for i in range(0, deviceCount):
            gpu = pynvml.nvmlDeviceGetHandleByIndex(i)

            dimensions = {}
            dimensions.update(Nvidia._get_driver_version())
            dimensions.update(Nvidia._get_device_uuid(gpu))
            dimensions.update(Nvidia._get_info_rom_image_version(gpu))
            dimensions.update(Nvidia._get_device_power_state(gpu))
            dimensions.update(Nvidia._get_device_vbios_version(gpu))

            measurements = {}
            measurements.update(Nvidia._get_fan_speed_percent(gpu))
            measurements.update(Nvidia._get_framebuffer_memory_stats(gpu))
            measurements.update(Nvidia._get_bar1_memory_stats(gpu))
            measurements.update(Nvidia._get_utilisation_stats(gpu))
            measurements.update(Nvidia._get_device_temperature(gpu))
            measurements.update(Nvidia._get_device_shutdown_temp(gpu))
            measurements.update(Nvidia._get_device_slowdown_temp(gpu))
            measurements.update(Nvidia._get_power_usage_watts(gpu))
            measurements.update(Nvidia._get_power_limit_watts(gpu))
            measurements.update(Nvidia._get_clock_info(gpu))
            measurements.update(Nvidia._get_clock_max_info(gpu))

            gpu_name = "{}_{}".format(
                Nvidia._get_device_name(gpu).get('name'),
                Nvidia._get_device_serial(gpu).get('serial'))
            gpu_info = {
                'name': gpu_name,
                'dimensions': dimensions,
                'measurements': measurements
            }
            all_info.append(gpu_info)
        pynvml.nvmlShutdown()
        return all_info
Example #25
def _torch_gpu_index_to_nvml_handle(index=None):
    """Convert the GPU index from torch to an NVML handle.

    With this function, we are sure to obtain the correct handle for the GPU
    used by pytorch.
    """
    if index is None:
        index = torch.cuda.current_device()

    device_count = nvmlDeviceGetCount()

    device_orders = os.environ.get(env_cuda_dev_order)
    if device_count > 1 and (device_orders is None
                             or device_orders != expected_dev_order):
        warnings.warn(
            "The environment variable {} should be set with value {}".format(
                env_cuda_dev_order, expected_dev_order))
        warnings.warn("GPU statistics can be wrong")

    devices_by_bus_id = []
    for nvml_device_index in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(nvml_device_index)
        pci_info = nvmlDeviceGetPciInfo(handle)
        devices_by_bus_id.append((pci_info.bus, handle))

    # sort by bus id and keep only the handles
    devices_by_bus_id = [dev[1] for dev in sorted(devices_by_bus_id)]

    visible_devices = os.environ.get(env_cuda_visible_devs)
    if visible_devices is None:
        available_device_handles = devices_by_bus_id
    else:
        available_device_handles = [
            devices_by_bus_id[int(d)] for d in visible_devices.split(",")
        ]

    return available_device_handles[index]
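A usage sketch (assuming the converter above, its module-level constants, its nvml* imports and torch are all available): query memory for the GPU torch is actually using, even when CUDA_VISIBLE_DEVICES reorders devices.

nvmlInit()
try:
    handle = _torch_gpu_index_to_nvml_handle()      # handle for torch.cuda.current_device()
    mem = nvmlDeviceGetMemoryInfo(handle)
    print('Current torch GPU: {} MiB used of {} MiB'.format(mem.used >> 20, mem.total >> 20))
finally:
    nvmlShutdown()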
Example #26
def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=1.0):
    """
    Checks for gpu availability and sets CUDA_VISIBLE_DEVICES as such.

    Note that this function does not do anything to 'reserve' gpus, it only
    limits what GPUS your program can see by altering the CUDA_VISIBLE_DEVICES
    variable. Other programs can still come along and snatch your gpu. This
    function is more about preventing **you** from stealing someone else's GPU.

    If more than 1 GPU is requested but not the full amount is available, then it
    will set the CUDA_VISIBLE_DEVICES variable to see all the available GPUs.
    A warning is generated in this case.

    If one or more GPUs were requested and none were available, a Warning
    will be raised. Before raising it, the CUDA_VISIBLE_DEVICES will be set to a
    blank string. This means the calling function can ignore this warning and
    proceed if it chooses to only use the CPU, and it should still be protected
    against putting processes on a busy GPU.

    You can call this function with num_gpus=0 to blank out the
    CUDA_VISIBLE_DEVICES environment variable.

    Parameters
    ----------
    num_gpus : int
        How many gpus your job needs (optional)
    gpu_select : iterable
        A single int or an iterable of ints indicating gpu numbers to
        search through.  If left blank, will search through all gpus.
    gpu_fraction : float
        The fraction of a gpu's memory that must be free for the script to
        consider the gpu free. Defaults to 1. Useful if someone has grabbed a
        tiny amount of memory on a gpu but isn't using it.

    Returns
    -------
    success : int
        Number of gpus 'grabbed'

    Raises
    ------
    RuntimeWarning
        If couldn't connect with NVIDIA drivers.
        If 1 or more gpus were requested and none were available.
    ValueError
        If the gpu_select option was not understood (can fix by leaving this
        field blank, providing an int or an iterable of ints).
    """
    # Set the visible devices to blank.
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    if num_gpus == 0:
        return 0

    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly.
                  Proceeding on cpu only..."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return 0

    numDevices = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * numDevices

    # Flag which gpus we can check
    if gpu_select is None:
        gpu_check = [True] * numDevices
    else:
        gpu_check = [False] * numDevices
        try:
            gpu_check[gpu_select] = True
        except TypeError:
            try:
                for i in gpu_select:
                    gpu_check[i] = True
            except:
                raise ValueError(
                    '''Please provide an int or an iterable of ints
                    for gpu_select''')

    # Print out GPU device info. Useful for debugging.
    for i in range(numDevices):
        # If the gpu was specified, examine it
        if not gpu_check[i]:
            continue

        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)

        str_ = "GPU {}:\t".format(i) + \
               "Used Mem: {:>6}MB\t".format(info.used/(1024*1024)) + \
               "Total Mem: {:>6}MB".format(info.total/(1024*1024))
        logger.debug(str_)

    # Now check if any devices are suitable
    for i in range(numDevices):
        # If the gpu was specified, examine it
        if not gpu_check[i]:
            continue

        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)

        # Sometimes GPU has a few MB used when it is actually free
        if (info.free + 10) / info.total >= gpu_fraction:
            gpu_free[i] = True
        else:
            logger.info('GPU {} has processes on it. Skipping.'.format(i))

    py3nvml.nvmlShutdown()

    # Now check whether we can create the session
    if sum(gpu_free) == 0:
        s = "Could not find enough GPUs for your job"
        warnings.warn(s, RuntimeWarning)
        logger.warning(s)
        return 0
    else:
        if sum(gpu_free) >= num_gpus:
            # only use the first num_gpus gpus. Hide the rest from greedy
            # tensorflow
            available_gpus = [i for i, x in enumerate(gpu_free) if x]
            use_gpus = ','.join(list(
                str(s) for s in available_gpus[:num_gpus]))
            logger.debug('{} Gpus found free'.format(sum(gpu_free)))
            logger.info('Using {}'.format(use_gpus))
            os.environ['CUDA_VISIBLE_DEVICES'] = use_gpus
            return num_gpus
        else:
            # use everything we can.
            s = "Only {} GPUs found but {}".format(sum(gpu_free), num_gpus) + \
                "requested. Allocating these and continuing."
            warnings.warn(s, RuntimeWarning)
            logger.warn(s)
            available_gpus = [i for i, x in enumerate(gpu_free) if x]
            use_gpus = ','.join(list(str(s) for s in available_gpus))
            logger.debug('{} Gpus found free'.format(sum(gpu_free)))
            logger.info('Using {}'.format(use_gpus))
            os.environ['CUDA_VISIBLE_DEVICES'] = use_gpus
            return sum(gpu_free)
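A usage sketch for grab_gpus (the public py3nvml helper shown above): claim up to two mostly-free GPUs before building a model, and fall back to the CPU when none were grabbed.

import os

n = grab_gpus(num_gpus=2, gpu_fraction=0.95)   # sets CUDA_VISIBLE_DEVICES as a side effect
if n == 0:
    print('No free GPU found, CUDA_VISIBLE_DEVICES is blank; running on CPU')
else:
    print('Grabbed {} GPU(s): CUDA_VISIBLE_DEVICES={}'.format(
        n, os.environ['CUDA_VISIBLE_DEVICES']))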
Example #27
def start_memory_tracing(
    modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
    modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
    events_to_trace: str = "line",
    gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
    """
    Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `./benchmark.py` for
    usage examples. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident
    Set Size" (the non-swapped physical memory the process is using). See
    https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info

    Args:

        - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list
          of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or
          'transformers.modeling_gpt2')
        - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list
          of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
        - `events_to_trace`: string or list of string of events to be recorded (see official python doc for
          `sys.settrace` for the list of events) default to line
        - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs

    Return:

        - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).

            - `UsedMemoryState` are named tuples with the following fields:

                - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current
                  file, location in current file)
                - 'cpu_memory': CPU RSS memory state *before* executing the line
                - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only
                  `gpus_to_trace` if provided)

    `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following
    fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module
    currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that
    triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script

    """
    if is_psutil_available():
        process = psutil.Process(os.getpid())
    else:
        logger.warning(
            "Psutil not installed, we won't log CPU memory usage. "
            "Install psutil (pip install psutil) to use CPU memory tracing.")
        process = None

    if is_py3nvml_available():
        try:
            nvml.nvmlInit()
            devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
            nvml.nvmlShutdown()
        except (OSError, nvml.NVMLError):
            logger.warning("Error while initializing communication with GPU. "
                           "We won't perform GPU memory tracing.")
            log_gpu = False
        else:
            log_gpu = is_torch_available() or is_tf_available()
    else:
        logger.warning(
            "py3nvml not installed, we won't log GPU memory usage. "
            "Install py3nvml (pip install py3nvml) to use GPU memory tracing.")
        log_gpu = False

    memory_trace = []

    def traceit(frame, event, args):
        """
        Tracing method executed before running each line in a module or sub-module Record memory allocated in a list
        with debugging information
        """
        global _is_memory_tracing_enabled

        if not _is_memory_tracing_enabled:
            return traceit

        # Filter events
        if events_to_trace is not None:
            if isinstance(events_to_trace, str) and event != events_to_trace:
                return traceit
            elif isinstance(events_to_trace,
                            (list, tuple)) and event not in events_to_trace:
                return traceit

        if "__name__" not in frame.f_globals:
            return traceit

        # Filter modules
        name = frame.f_globals["__name__"]
        if not isinstance(name, str):
            return traceit
        else:
            # Filter whitelist of modules to trace
            if modules_to_trace is not None:
                if isinstance(modules_to_trace,
                              str) and modules_to_trace not in name:
                    return traceit
                elif isinstance(modules_to_trace, (list, tuple)) and all(
                        m not in name for m in modules_to_trace):
                    return traceit

            # Filter blacklist of modules not to trace
            if modules_not_to_trace is not None:
                if isinstance(modules_not_to_trace,
                              str) and modules_not_to_trace in name:
                    return traceit
                elif isinstance(modules_not_to_trace, (list, tuple)) and any(
                        m in name for m in modules_not_to_trace):
                    return traceit

        # Record current tracing state (file, location in file...)
        lineno = frame.f_lineno
        filename = frame.f_globals["__file__"]
        if filename.endswith(".pyc") or filename.endswith(".pyo"):
            filename = filename[:-1]
        line = linecache.getline(filename, lineno).rstrip()
        traced_state = Frame(filename, name, lineno, event, line)

        # Record current memory state (rss memory) and compute difference with previous memory state
        cpu_mem = 0
        if process is not None:
            mem = process.memory_info()
            cpu_mem = mem.rss

        gpu_mem = 0
        if log_gpu:
            # Clear GPU caches
            if is_torch_available():
                torch_empty_cache()
            if is_tf_available():
                tf_context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802

            # Sum used memory for all GPUs
            nvml.nvmlInit()

            for i in devices:
                handle = nvml.nvmlDeviceGetHandleByIndex(i)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_mem += meminfo.used

            nvml.nvmlShutdown()

        mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
        memory_trace.append(mem_state)

        return traceit

    sys.settrace(traceit)

    global _is_memory_tracing_enabled
    _is_memory_tracing_enabled = True

    return memory_trace
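A usage sketch for start_memory_tracing (assuming the function, its module-level globals, and psutil/py3nvml installs are in place; run_workload is a hypothetical stand-in for the code being profiled, and sys.settrace(None) is used directly since the library's own stop helper is not shown here).

import sys

memory_trace = start_memory_tracing(modules_to_trace='transformers')
run_workload()                 # hypothetical: whatever you want to profile line by line
sys.settrace(None)             # stop tracing manually

for frame, cpu_mem, gpu_mem in memory_trace[:5]:
    print('{}:{} cpu={} B gpu={} B'.format(frame.filename, frame.line_number, cpu_mem, gpu_mem))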
Example #28
def new_query():
    """Query the information of all the GPUs on local machine"""

    N.nvmlInit()

    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except:
            power_limit = None

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': int(power / 1000) if power is not None else None,
            'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info

    # 1. get the list of gpu and status
    gpu_list = {}
    device_count = N.nvmlDeviceGetCount()

    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        gpu_info = get_gpu_info(handle)
        # gpu_stat = GPUStat(gpu_info)
        gpu_list[index] = gpu_info

    N.nvmlShutdown()
    return gpu_list
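A usage sketch for new_query (assuming the function and its N/psutil imports are in scope): print a one-line, gpustat-style summary per GPU.

for index, gpu in new_query().items():
    print('[{}] {} | {}C | util {} | {}/{} MB | {} proc(s)'.format(
        index, gpu['name'], gpu['temperature.gpu'], gpu['utilization.gpu'],
        gpu['memory.used'], gpu['memory.total'],
        len(gpu['processes']) if gpu['processes'] is not None else 'n/a'))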
Example #29
    def get_stats(self):
        """
        Get system statistics and assign to `self`
        """
        memory_usage = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        # net = psutil.net_io_counters()
        system = {
            # CPU utilization percent (can be over 100%)
            'cpu': round10e5(self._process.cpu_percent(0.0)),

            # Whole system memory usage
            # 'memory_used': round10e5(memory_usage.used / 1024 / 1024),
            'memory_percent': round10e5(memory_usage.used * 100 / memory_usage.total),

            # Get the portion of memory occupied by a process
            # 'p_memory_rss': round10e5(self._process.memory_info().rss / 1024 / 1024),
            'p_memory_percent': round10e5(self._process.memory_percent()),

            # Disk usage
            # 'disk_used': round10e5(disk_usage.used / 1024 / 1024),
            'disk_percent': round10e5(disk_usage.percent),
        }

        # Collect GPU statistics
        gpus = []
        try:
            gpu_device_count = nvml.nvmlDeviceGetCount()
            for i in range(gpu_device_count):
                handle = nvml.nvmlDeviceGetHandleByIndex(i)
                nvml_tmp = nvml.NVML_TEMPERATURE_GPU

                # Get device memory and temperature
                util = nvml.nvmlDeviceGetUtilizationRates(handle)
                memory = nvml.nvmlDeviceGetMemoryInfo(handle)
                temp = nvml.nvmlDeviceGetTemperature(handle, nvml_tmp)

                # Compute power usage in watts and percent
                power_watts = nvml.nvmlDeviceGetPowerUsage(handle) / 1000
                power_cap = nvml.nvmlDeviceGetEnforcedPowerLimit(handle)
                power_cap_watts = power_cap / 1000
                power_usage = power_watts / power_cap_watts * 100  # percent of the enforced power cap

                gpus.append({
                    # GPU utilization percent
                    'gpu': round10e5(util.gpu),

                    # Device memory usage
                    # 'memory_used': round10e5(memory.used / 1024 / 1024),
                    'gpu_memory_percent': round10e5(memory.used * 100 / memory.total),

                    # Power usage in watts and percent
                    'gpu_power_watts': round10e5(power_watts),
                    # 'power_percent': round10e5(power_usage),

                    # Device temperature
                    'gpu_temp': round10e5(temp),
                })
        except Exception:
            pass

        return system, gpus
Example #30
 def __init__(self):
     num_gpus = py3nvml.nvmlDeviceGetCount()
     self.gpus = [GpuInfo(i) for i in range(num_gpus)]