Beispiel #1
0
    def _get_gpu_usage():
        """Collect per-GPU stats from gpustat as a list of dicts.

        Each dict is a copy of one GPU's ``entry`` mapping with every "."
        in a key replaced by "_".

        Returns:
            One dict per GPU reported by ``gpustat.new_query()``. The list
            is empty when gpustat is unavailable, when the usage check has
            been disabled, or when the query raises.
        """
        global enable_gpu_usage_check
        if gpustat is None:
            return []
        if not enable_gpu_usage_check:
            return []

        try:
            devices = gpustat.new_query().gpus
        except Exception as e:
            logger.debug(f"gpustat failed to retrieve GPU information: {e}")

            # gpustat calls pynvml.nvmlInit(). On machines without GPUs this
            # can run subprocesses that spew to stderr, and with
            # log_to_driver=True that spew is repeated by every raylet.
            # Turn the GPU usage check off for good once this specific
            # error is seen, so the query is not attempted again.
            # https://github.com/ray-project/ray/issues/14305
            # https://github.com/ray-project/ray/pull/21686
            # The exception is matched by class name rather than by type.
            driver_missing = type(e).__name__ == "NVMLError_DriverNotLoaded"
            if driver_missing:
                enable_gpu_usage_check = False
            return []

        # The entry keys contain periods, which throws off javascript
        # consumers, so every "." becomes "_".
        return [
            {
                name.replace(".", "_"): value
                for name, value in device.entry.items()
            }
            for device in devices
        ]
Beispiel #2
0
 def get_gpu_usage():
     """Collect per-GPU stats from gpustat as a list of dicts.

     Each dict is a copy of one GPU's ``entry`` mapping with every "."
     in a key replaced by "_".

     Returns:
         One dict per GPU reported by ``gpustat.new_query()``. The list is
         empty when gpustat is unavailable or when the query raises.
     """
     if gpustat is None:
         return []

     try:
         devices = gpustat.new_query().gpus
     except Exception as e:
         # Best effort: a failed query is logged at debug level and
         # reported as "no GPUs" rather than propagated.
         logger.debug(f"gpustat failed to retrieve GPU information: {e}")
         return []

     # The entry keys contain periods, which throws off javascript
     # consumers, so every "." becomes "_".
     return [
         {
             name.replace(".", "_"): value
             for name, value in device.entry.items()
         }
         for device in devices
     ]