import logging
from typing import List, Optional

import torch
from torch import device as Device

try:
    from GPUtil import getAvailable
except ImportError:
    # GPUtil is optional; fall back to counting CUDA devices below
    getAvailable = None

_LOGGER = logging.getLogger(__name__)

# thresholds for the GPUtil availability check; the exact values here are
# illustrative assumptions, not shown in the original module
_MAX_NUM_GPUS = 8
_MAX_GPU_LOAD = 0.2
_MAX_GPU_MEM_USED = 0.2


def get_computation_devices(
        preferred_gpu_list: Optional[List[int]],
        multi_gpu_flag: bool,
) -> List[Device]:
    """get the available computation devices (CPU & GPUs)

    Get the computation devices for deep learning experiments with a given
    preferred list of GPUs and a flag for multi-GPU computation.

    :param preferred_gpu_list: preferred list of GPUs represented with
        integers starting from 0. For instance, [0, 2] represents the
        first and the third GPUs. None or an empty list of GPUs
        indicates the usage of CPU
    :param multi_gpu_flag: boolean flag for multi-GPU training and testing
    :return: list of devices representing the available computation
        devices (CPU & GPUs)
    """
    # use CPU when GPUs are not preferred or not available
    if (preferred_gpu_list is None) \
            or (len(preferred_gpu_list) == 0) \
            or (not torch.cuda.is_available()):
        return [Device('cpu'), ]

    # else GPUs are preferred and available
    # get all available GPU indexes
    _available_gpu_list: List[int]
    if getAvailable:
        # by default, use GPU utility package with load and memory usage
        # specification so that the 'available' GPUs are actually ready
        # for deep learning runs (https://github.com/anderskm/gputil)
        _available_gpu_list = getAvailable(
            limit=_MAX_NUM_GPUS,
            maxLoad=_MAX_GPU_LOAD,
            maxMemory=_MAX_GPU_MEM_USED,
        )
    else:
        # assume all GPUs are good to use without GPUtil package
        _available_gpu_list = list(range(torch.cuda.device_count()))
        _warning_msg = \
            f'GPUtil (https://github.com/anderskm/gputil) not installed. ' \
            f'Assuming all GPUs ({_available_gpu_list}) are available ' \
            f'and ready for training ... '
        _LOGGER.warning(_warning_msg)

    # get the overlap between the preferred and the available GPUs
    _gpus = [_g for _g in _available_gpu_list if _g in preferred_gpu_list]

    # use CPU if none of the preferred GPUs is available
    if len(_gpus) == 0:
        return [Device('cpu'), ]

    # otherwise return one or all GPUs depending on the multi-GPU flag
    return [Device(f'cuda:{_g}') for _g in _gpus] \
        if multi_gpu_flag else [Device(f'cuda:{_gpus[0]}'), ]
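# A minimal usage sketch for the function above; the device lists shown in
# the trailing comments are assumptions for a machine with two idle GPUs,
# not guaranteed output:
_cpu_only = get_computation_devices(None, multi_gpu_flag=False)
print(_cpu_only)  # [device(type='cpu')] -- CPU fallback, no GPUs preferred

_single = get_computation_devices([0, 1], multi_gpu_flag=False)
print(_single)  # e.g. [device(type='cuda', index=0)] -- first match only

_multi = get_computation_devices([0, 1], multi_gpu_flag=True)
print(_multi)  # e.g. [device(type='cuda', index=0), device(type='cuda', index=1)]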
def auto_gpus_select(maxGpuNum):
    from GPUtil import getAvailable
    from time import sleep

    # poll every 10 seconds until at least one GPU (up to maxGpuNum, ordered
    # by free memory) falls under the 20% load and 20% memory thresholds
    while True:
        gpu_idxs = getAvailable(order='memory', limit=maxGpuNum,
                                maxLoad=0.2, maxMemory=0.2,
                                includeNan=False)
        if len(gpu_idxs):
            break
        else:
            print("No available GPU! Wait for other users to exit...")
            sleep(10)
    return gpu_idxs
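# Hedged usage sketch for auto_gpus_select above: export the selected
# indices through CUDA_VISIBLE_DEVICES before any CUDA context is created,
# so frameworks only see the chosen GPUs (the two-GPU request is just an
# example value):
import os

_selected = auto_gpus_select(maxGpuNum=2)  # blocks until GPUs free up
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(_i) for _i in _selected)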
from GPUtil import getAvailable


def get_gpus():
    # GPUs with essentially zero load and under 1% memory use, in random order
    return getAvailable(order="random", limit=8,
                        maxLoad=10**-6, maxMemory=10**-2)
# the extended variant below reuses the imports, constants, and logger from
# the first snippet, plus `os` and `Union`:
import os
from typing import Union


def get_computation_devices(
        preferred_gpu_list: Optional[Union[List[int], str]],
        multi_gpu_flag: bool,
) -> List[Device]:
    """get the available computation devices (CPU & GPUs)

    Get the computation devices for deep learning experiments with a given
    preferred list of GPUs and a flag for multi-GPU computation.

    :param preferred_gpu_list: preferred list of GPUs represented with
        integers starting from 0; e.g. [0, 2] represents the first and
        the third GPUs; None or an empty list of GPUs indicates the
        usage of CPU; and 'all' indicates that all GPUs are preferred
    :type preferred_gpu_list: Optional[Union[List[int], str]]
    :param multi_gpu_flag: boolean flag for multi-GPU training and testing
    :type multi_gpu_flag: bool
    :return: list of available computation devices (CPU & GPUs) visible
        to PyTorch (not hardware indices from PCIe)
    :rtype: List[Device]
    """
    # use CPU when GPUs are not preferred or not available
    if (preferred_gpu_list is None) \
            or (len(preferred_gpu_list) == 0) \
            or (not torch.cuda.is_available()):
        return [Device('cpu'), ]

    # else GPUs are preferred and available
    # get all available GPU indexes
    _available_gpu_list: List[int]
    if getAvailable:
        # by default, use GPU utility package with load and memory usage
        # specification so that the 'available' GPUs are actually ready
        # for deep learning runs (https://github.com/anderskm/gputil)
        _available_gpu_list = getAvailable(
            limit=_MAX_NUM_GPUS,
            maxLoad=_MAX_GPU_LOAD,
            maxMemory=_MAX_GPU_MEM_USED,
        )
    else:
        # assume all GPUs are good to use without GPUtil package
        _available_gpu_list = list(range(torch.cuda.device_count()))
        _warning_msg = \
            f'GPUtil (https://github.com/anderskm/gputil) not installed. ' \
            f'Assuming all GPUs ({_available_gpu_list}) are available ' \
            f'and ready for training ... '
        _LOGGER.warning(_warning_msg)

    # if CUDA_VISIBLE_DEVICES is set as an environment variable, then make
    # sure that the list of available GPUs are visible, and re-index them in
    # the way that PyTorch can access. For example:
    # CUDA_VISIBLE_DEVICES = [4, 5, 6, 7]
    # GPUs not in use = [1, 4, 7]
    # available GPUs = [4, 7]
    # available GPUs for PyTorch = [0, 3]
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        _available_gpu_list_: List[int] = []
        _cuda_visible_devices: List[int] = \
            [int(_i) for _i in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
        for __available_gpu_id in _available_gpu_list:
            if __available_gpu_id in _cuda_visible_devices:
                _available_gpu_list_.append(
                    _cuda_visible_devices.index(__available_gpu_id))
        _available_gpu_list = _available_gpu_list_

    # double-check to make sure that all GPUs are accessible to PyTorch
    _available_gpu_list_ = []
    for __available_gpu_id in _available_gpu_list:
        try:
            torch.cuda.get_device_properties(__available_gpu_id)
        except Exception:
            _warning_msg = \
                f'CUDA device {__available_gpu_id}, despite being in the ' \
                f'range of the PyTorch GPU count, is not available.'
            _LOGGER.warning(_warning_msg)
            print(_warning_msg)
        else:
            _available_gpu_list_.append(__available_gpu_id)
    _available_gpu_list = _available_gpu_list_

    # get the overlap between the preferred and the available GPUs
    if isinstance(preferred_gpu_list, str) and preferred_gpu_list == 'all':
        _gpus = _available_gpu_list
    elif isinstance(preferred_gpu_list, list):
        _gpus = list(set(_available_gpu_list).intersection(preferred_gpu_list))
    else:
        _error_msg = \
            f'Unknown parameter {preferred_gpu_list} for the list of ' \
            f'preferred GPUs. Must be either "all" or a list of integers.'
        raise ValueError(_error_msg)

    # use CPU if none of the preferred GPUs is available
    if len(_gpus) == 0:
        return [Device('cpu'), ]

    # otherwise return one or all GPUs depending on the multi-GPU flag
    return [Device(f'cuda:{_g}') for _g in _gpus] \
        if multi_gpu_flag else [Device(f'cuda:{_gpus[0]}'), ]
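# Illustrative calls into the extended variant above; 'all' defers the GPU
# choice entirely to availability, and the commented results are
# assumptions rather than guaranteed output:
_all_devices = get_computation_devices('all', multi_gpu_flag=True)
print(_all_devices)  # e.g. [device(type='cuda', index=0), ...] on a GPU box
print(get_computation_devices('all', multi_gpu_flag=False))
# e.g. [device(type='cuda', index=0)], or [device(type='cpu')] if none free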
from GPUtil import getAvailable


def get_gpus():
    # available GPU indices as strings, with looser 10% load/memory limits
    return [
        str(idx)
        for idx in getAvailable(limit=8, maxLoad=10**-1, maxMemory=10**-1)
    ]
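# The string indices from get_gpus above drop straight into
# CUDA_VISIBLE_DEVICES; a sketch of that pattern (how the caller actually
# applies the result is an assumption):
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(get_gpus())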