def get_dev(n=1, ok=(0, 1, 2, 3, 4, 5, 6, 7)):
    import GPUtil, time
    print('Auto select gpu')
    GPUtil.showUtilization()

    def _limit(devs, ok):
        return [dev for dev in devs if dev in ok]

    devs = GPUtil.getAvailable(order='memory', maxLoad=0.5, maxMemory=0.5, limit=n)
    # devs = _limit(devs, ok)
    if len(devs) >= 1:
        print('available {}'.format(devs))
        # GPUtil.showUtilization()
        return devs[0] if n == 1 else devs
    while len(devs) == 0:
        devs = GPUtil.getAvailable(order='random', maxLoad=0.98, maxMemory=0.98, limit=n)
        devs = _limit(devs, ok)
        if len(devs) >= 1:
            print('available {}'.format(devs))
            GPUtil.showUtilization()
            return devs[0] if n == 1 else devs
        print('no device available')
        GPUtil.showUtilization()
        time.sleep(60)  # 60 * 3
def get_dev(n=1, ok=(0, 1, 2, 3, 4, 5, 6, 7), mem=(0.5, 0.9), sleep=60):
    import GPUtil, time, logging
    logging.info('Auto select gpu')
    GPUtil.showUtilization()

    def _limit(devs, ok):
        return [int(dev) for dev in devs if dev in ok]

    devs = GPUtil.getAvailable(order='memory', maxLoad=1, maxMemory=mem[0], limit=n)
    # devs = _limit(devs, ok)
    if len(devs) >= 1:
        logging.info('available {}'.format(devs))
        # GPUtil.showUtilization()
        return int(devs[0]) if n == 1 else devs
    while len(devs) == 0:
        devs = GPUtil.getAvailable(order='random', maxLoad=1, maxMemory=mem[1], limit=n)
        devs = _limit(devs, ok)
        if len(devs) >= 1:
            logging.info('available {}'.format(devs))
            GPUtil.showUtilization()
            return devs[0] if n == 1 else devs
        logging.info('no device available')
        GPUtil.showUtilization()
        time.sleep(sleep)
def check_configs():
    if cfg.MODE in ('train', ):
        cfg.TEST.USE_SAVED_PRED_RES = 'none'
    elif cfg.MODE in ('vis', ):
        cfg.TEST.EVAL_SEG_TAG_ON_GT = False
        cfg.LOG_IN_FILE = False
    elif cfg.MODE in ('demo', 'batch'):
        cfg.TEST.USE_SAVED_PRED_RES = 'none'
        cfg.TEST.EVAL_SEG_TAG_ON_GT = False
    scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
    assert scales == cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES
    if cfg.MODEL.BACKBONE.FEATURE_UPSAMPLE:
        assert len(scales) == 1 and scales[0] == 1. / 2**(
            cfg.MODEL.BACKBONE.FEATURE_UPSAMPLE_LEVEL - 1)
        anchor = cfg.MODEL.RPN.ANCHOR_STRIDE
        assert len(anchor) == 1 and anchor[0] == 1. / scales[0]
    if not cfg.MODEL.USE_3D_FUSION:
        assert cfg.INPUT.NUM_IMAGES_3DCE == 1
        assert cfg.MODEL.BACKBONE.FEATURE_FUSION_LEVELS == [False] * 3
    if cfg.GPU == '':
        import GPUtil
        deviceIDs = GPUtil.getAvailable(order='lowest', limit=1, maxMemory=.2)
        if len(deviceIDs) == 0:
            deviceIDs = GPUtil.getAvailable(order='lowest', limit=1, maxMemory=.9, maxLoad=1)
        cfg.GPU = str(deviceIDs[0])
def allocate_GPU(gpu_num):
    device_list = GPUtil.getAvailable(limit=gpu_num)
    while len(device_list) != gpu_num:
        print('Cannot allocate GPU. Waiting...')
        time.sleep(60)
        device_list = GPUtil.getAvailable(limit=gpu_num)
    devices = ''
    for device in device_list:
        devices += str(device) + ','
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = devices
def Assign_GPU():
    excluded_IDs = [2]
    GPU_2_use = GPUtil.getAvailable(order='memory', excludeID=excluded_IDs)
    if len(GPU_2_use) == 0:
        print('No available GPUs. waiting...')
        while len(GPU_2_use) == 0:
            time.sleep(10)
            GPU_2_use = GPUtil.getAvailable(order='memory', excludeID=excluded_IDs)
    print('Using GPU #%d' % (GPU_2_use[0]))
    return GPU_2_use
def Assign_GPU(max_GPUs=1, **kwargs):
    excluded_IDs = []
    GPU_2_use = GPUtil.getAvailable(order='memory', excludeID=excluded_IDs,
                                    limit=max_GPUs if max_GPUs is not None else 100, **kwargs)
    if len(GPU_2_use) == 0:
        print('No available GPUs. waiting...')
        while len(GPU_2_use) == 0:
            time.sleep(10)
            GPU_2_use = GPUtil.getAvailable(order='memory', excludeID=excluded_IDs)
    assert len(GPU_2_use) > 0, 'No available GPUs...'
    if max_GPUs is not None:
        print('Using GPU #%d' % (GPU_2_use[0]))
        # Limit to 1 GPU when using an interactive session
        os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % (GPU_2_use[0])
        return [GPU_2_use[0]]
    else:
        return GPU_2_use
def getAvailableGPU(maxload, maxmem, check_docker=True):
    # First, get available GPUs by resource usage
    availableIDs = GPUtil.getAvailable(order='first', limit=8, maxLoad=maxload, maxMemory=maxmem)
    if len(availableIDs) < 1:
        return None
    elif check_docker:
        # Then exclude GPUs already claimed by running docker containers
        tmp_ids = os.popen(
            "docker inspect $(docker ps -q)|grep NVIDIA_VISIBLE_DEVICES").read(
            ).replace("NVIDIA_VISIBLE_DEVICES=", "").replace('''"''', "").split()
        print(tmp_ids)
        try:
            ids = set()
            for entry in tmp_ids:
                ids.update(entry.split(','))
            invalid_gpus = [int(x) for x in ids if x != '' and x != 'all']
        except Exception:
            invalid_gpus = []
        print(invalid_gpus)
        final_availableIDs = [x for x in availableIDs if x not in invalid_gpus]
        print(final_availableIDs)
        if len(final_availableIDs) > 0:
            return final_availableIDs.pop()
        return None
    else:
        return availableIDs.pop()
def runTask():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--maxGPU', type=int, default=1000)
    parser.add_argument('--needGPU', type=int, default=1)
    parser.add_argument('--maxLoad', type=float, default=0.1)
    parser.add_argument('--maxMemory', type=float, default=0.1)
    parser.add_argument('--sleeptime', type=float, default=60)
    parser.add_argument('--user', type=str)
    parser.add_argument('file', nargs=1)
    args = parser.parse_args()

    from subprocess import Popen, PIPE
    import time
    import GPUtil
    import random
    import os

    maxGPU = args.maxGPU
    needGPU = args.needGPU
    maxLoad = args.maxLoad
    maxMemory = args.maxMemory
    file = args.file[0]
    user = args.user
    sleeptime = args.sleeptime
    while True:
        with open(file) as f:
            lines = [line for line in f if line.strip()]
        if lines:
            while True:
                # Count the GPU processes currently owned by the given user
                s = ('for x in $(nvidia-smi --query-compute-apps=pid --format=csv,noheader,nounits); '
                     'do ps -f -p $x | grep "%s"; done' % user)
                p = Popen(s, stdout=PIPE, shell=True)
                ans = p.stdout.read()
                mygpu = len(ans.splitlines())
                deviceIDs = GPUtil.getAvailable(order='first', limit=needGPU, maxLoad=maxLoad,
                                                maxMemory=maxMemory, includeNan=False,
                                                excludeID=[], excludeUUID=[])
                find = False
                if mygpu < maxGPU and len(deviceIDs) >= needGPU:
                    os.system(lines[0].strip())
                    print('running command (%s)' % lines[0].strip())
                    find = True
                time.sleep(sleeptime)
                if find:
                    break
            with open(file, 'w') as f:
                for line in lines[1:]:
                    f.write(line)
        else:
            break
def configure_tf_devices(visible_ids=None):
    # Do nothing if no visible GPU IDs
    if not visible_ids or visible_ids[0] == -1:
        return
    try:
        deviceIDs = GPUtil.getAvailable(order='load', limit=100, maxLoad=0.5, maxMemory=0.5,
                                        includeNan=False, excludeID=[], excludeUUID=[])
    except ValueError:
        cprint(NO_NVIDIA_GPUS, 'yellow')
        return
    # Keep only the GPUs the caller made visible
    deviceIDs = [id_ for id_ in deviceIDs if id_ in visible_ids]
    if not deviceIDs:
        cprint("Error: Currently, no GPU is eligible (available memory and load at <=50%)", "red")
        GPUtil.showUtilization()
    else:
        cprint("GPUs with utilization and memory load <50%: {}".format(
            ', '.join([str(x) for x in deviceIDs])), "green")
    return deviceIDs
def set_gpus(n_gpus):
    """Find GPUs to use, if possible. Return TF config."""
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    try:
        device_IDs = GPUtil.getAvailable(order='load', limit=n_gpus)
    except FileNotFoundError:
        print('\n---- No GPUs on this machine ----\n')
        return (gpu_config, 0)
    if len(device_IDs) > 0:
        str_device_list = ','.join([str(x) for x in device_IDs])
        gpu_config.gpu_options.visible_device_list = str_device_list
        if len(device_IDs) < n_gpus:
            print('\n**** Note: {0} GPUs requested, but only {1} found ****'.format(
                n_gpus, len(device_IDs)))
        print('\n---- Running on GPU(s) {} ----\n'.format(str_device_list))
    else:
        print('\n---- No GPUs available! ----\n')
    return (gpu_config, len(device_IDs))
def _check_params(self):
    self._np = np
    self._cupyx = None
    self.loglikelihood_ = -np.inf
    self.trained_successfully_ = False
    if self.use_gpu and (not _CUPY_INSTALLED or not _DEFAULT_USE_GPU
                         or not cupy.cuda.is_available()):
        self.gpu_number = None
        self.use_gpu = False
        logger.warning(
            "GPU not used as cupy library seems not to be installed or CUDA is not available")
    if (self.use_gpu and _CUPY_INSTALLED and _DEFAULT_USE_GPU
            and cupy.cuda.is_available()):
        if self.gpu_index is not None:
            cupy.cuda.Device(self.gpu_index).use()
            self._np = cupy
            self._cupyx = cupyx
        else:
            free_idx = GPUtil.getAvailable("memory", limit=10)
            if not free_idx:
                self.use_gpu = False
                logger.warning("GPU not used as no gpu is free")
            else:
                self._np = cupy
                self._cupyx = cupyx
                gpu_number = free_idx[0]
                cupy.cuda.Device(gpu_number).use()
def get_gpu_info(self) -> Dict[str, str]:
    try:
        gpus: List[GPU] = GPUtil.getGPUs()
        available_gpus: List = GPUtil.getAvailable(order='memory', limit=10, maxLoad=0.4,
                                                   maxMemory=0.4, includeNan=False,
                                                   excludeID=[], excludeUUID=[])
        available_gpus: List = list(filter(lambda gpu: gpu.id in available_gpus, gpus))
        # return -1 if no nvidia-smi visible else return available gpus
        # {"id": "gpu name"}
        if len(gpus) == 0:
            return {"-1": "CPU"}
        else:
            gpus_dict: Dict[str, str] = {}
            for gpu in available_gpus:
                gpus_dict[str(gpu.id)] = ("GPU " + str(gpu.id) + " - " + str(gpu.name) +
                                          " - Available Memory: " + str(int(gpu.memoryFree)) +
                                          "MB/" + str(int(gpu.memoryTotal)) + "MB")
            return gpus_dict
    except Exception as e:
        raise GpuInfoInvalid(str(e))
def get_device():
    """
    Get one gpu id that has the most available memory.

    Returns:
        (int, str): The gpu id (None if no available gpu) and the device string (pytorch style).
    """
    # get the gpus with the lowest memory utilization first
    gpu_id_list = GPUtil.getAvailable(order="memory", limit=3)
    if len(gpu_id_list) < 1:
        gpu_id = None
        device_str = "cpu"
    else:
        gpu_id = gpu_id_list[0]
        # need to set gpu_id to 0 if ray only exposes 1 gpu via CUDA_VISIBLE_DEVICES
        if ("CUDA_VISIBLE_DEVICES" in os.environ
                and len(os.environ["CUDA_VISIBLE_DEVICES"].split()) == 1):
            # gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
            gpu_id = 0
            print("Find only one gpu with id: ", gpu_id)
            # print(os.system("nvidia-smi"))
        else:
            print("Get a gpu id list sorted by the most available memory:", gpu_id)
        device_str = "cuda:" + str(gpu_id)
    return gpu_id, device_str
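A short usage sketch for the helper above, assuming it is importable from the surrounding module: it only shows consuming the returned PyTorch-style device string; the toy model and tensor below are hypothetical and not part of the original code.

import torch

gpu_id, device_str = get_device()
device = torch.device(device_str)          # "cuda:<id>" or "cpu"
model = torch.nn.Linear(8, 2).to(device)   # hypothetical toy model, only to show .to(device)
x = torch.randn(4, 8, device=device)
print(model(x).shape)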
def set_gpu():
    gpu = GPUtil.getAvailable('last', limit=5, excludeID=[0, 1])
    vis_gpu = ""
    for g in gpu:
        vis_gpu += ", " + str(g)
    vis_gpu = vis_gpu[1:]
    os.environ["CUDA_VISIBLE_DEVICES"] = vis_gpu
def __init__(self, video_dir, cache_dir=None, stage_n=None, n_gpu=4, batch_size=4, stride=1):
    available_gpus = GPUtil.getAvailable(limit=n_gpu, maxLoad=0.2, maxMemory=0.2)
    n_available_gpu = len(available_gpus)
    stages = [
        LoaderStage(),
        DetectorStage(available_gpus, 2),
        TrackerStage(),
        MinotorStage(cache_dir is not None)
    ]
    stages = stages[:stage_n]
    super().__init__(stages, video_dir, cache_dir, batch_size, stride)
    if n_available_gpu == 0:
        self.logger.warn('No gpus available, running on cpu')
    elif n_available_gpu < n_gpu:
        self.logger.warn(
            '%d gpus requested, but only %d gpus (gpu id: %s) available',
            n_gpu, n_available_gpu, available_gpus)
    else:
        self.logger.info('Running on %d gpus (gpu id: %s)', n_gpu, available_gpus)
    if isinstance(stages[-1], (DetectorStage.func)) and n_gpu > 1:
        self.logger.warn(
            'Last stage is %s with %d gpus, '
            'results may be out of order and incomplete',
            stages[-1].__class__.__name__, n_gpu)
    self.videos_processed = []
    self.events = []
def generate_jobs(self, n, wait_seconds=20):
    for i in range(n):
        params = {name: spec.sample() for name, spec in self.params.items()}
        if self.name is not None:
            ids = ['{}={}'.format(k, params[k]) for k in self.keys]
            params[self.name] = hashlib.sha256(','.join(ids).encode()).hexdigest()
        gpu_prefix = ''
        if self.gpu is not None:
            while True:
                available = GPUtil.getAvailable(order='first', limit=1, maxMemory=0.01)
                if available:
                    break
                else:
                    time.sleep(wait_seconds)
            params['gpu'] = 0
            gpu_prefix = 'CUDA_VISIBLE_DEVICES={} '.format(available[0])
        command = '{}{} '.format(gpu_prefix, self.executable)
        specs = []
        for k in sorted(list(params.keys())):
            v = params[k]
            specs.append('--{} {}'.format(k, v))
        command += ' '.join(specs)
        yield command, params
def compile_sequential_model():
    # Compiles and trains the neural network
    data = request.json
    notebook = get_notebook_data(data['notebook_name'])
    notebook['hyperparameters'] = data['hyperparameters']
    notebook["history"] = {
        "acc": [],
        "val_acc": [],
        "loss": [],
        "val_loss": []
    }
    # allocate specified device while creating notebook
    config = tensorflow.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = (
        notebook["GPU_count"] / len(GPUtil.getAvailable()))
    keras.backend.tensorflow_backend.set_session(tensorflow.Session(config=config))
    notebook['is_online'] = True
    # load created model
    model = keras.models.model_from_json(notebook['model'])
    # compile with client-sent hyperparameters
    model.compile(loss=data['hyperparameters']['loss'],
                  optimizer=keras.optimizers.SGD(
                      lr=float(data['hyperparameters']['learning_rate']),
                      momentum=float(data['hyperparameters']['momentum']),
                      nesterov=bool(data['hyperparameters']['nesterov'])),
                  metrics=['acc'])
    # Training starts
    model.fit(x=notebook['x_train'],
              y=notebook['y_train'],
              batch_size=128,
              validation_data=(notebook['x_test'], notebook['y_test']),
              epochs=int(data['hyperparameters']['epochs']),
              callbacks=[on_epoch_end_callback(notebook=notebook)])
    # save model separately as model weights could not be pickled
    model.save("NOTEBOOK_" + data['notebook_name'] + "_neural_network_model.hdf5")
    notebook['model'] = model.to_json()
    set_notebook_data(data['notebook_name'])
    try:
        keras.backend.clear_session()
    except:
        pass
    return json_encoder.encode({
        "message": "Success",
        "comment": "Compiled model and trained"
    })
def select_devices(num_gpus_to_use=0, max_load=0.01, max_memory=0.01, exclude_gpu_ids=None):
    if num_gpus_to_use == 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
    else:
        if exclude_gpu_ids is None:
            exclude_gpu_ids = []
        gpu_to_use = GPUtil.getAvailable(
            order="first",
            limit=num_gpus_to_use,
            maxLoad=max_load,
            maxMemory=max_memory,
            includeNan=False,
            excludeID=exclude_gpu_ids,
            excludeUUID=[],
        )
        if len(gpu_to_use) < num_gpus_to_use:
            raise OSError(
                "Couldn't find enough GPU(s) as required by the user, stopping program "
                "- consider reducing the requirements or using num_gpus_to_use=0 to use CPU"
            )
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(gpu_idx) for gpu_idx in gpu_to_use
        )
        print("GPUs selected have IDs {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
def _get_device_map(self):
    self.logger.info('get devices')
    run_on_gpu = False
    device_map = [-1] * self.num_worker
    if not self.args.cpu:
        try:
            import GPUtil
            num_all_gpu = len(GPUtil.getGPUs())
            avail_gpu = GPUtil.getAvailable(order='memory', limit=min(num_all_gpu, self.num_worker))
            num_avail_gpu = len(avail_gpu)

            if num_avail_gpu >= self.num_worker:
                run_on_gpu = True
            elif 0 < num_avail_gpu < self.num_worker:
                self.logger.warning('only %d out of %d GPU(s) is available/free, but "-num_worker=%d"' %
                                    (num_avail_gpu, num_all_gpu, self.num_worker))
                if not self.args.device_map:
                    self.logger.warning('multiple workers will be allocated to one GPU, '
                                        'may not scale well and may raise out-of-memory')
                else:
                    self.logger.warning('workers will be allocated based on "-device_map=%s", '
                                        'may not scale well and may raise out-of-memory' % self.args.device_map)
                run_on_gpu = True
            else:
                self.logger.warning('no GPU available, fall back to CPU')

            if run_on_gpu:
                device_map = ((self.args.device_map or avail_gpu) * self.num_worker)[:self.num_worker]
        except FileNotFoundError:
            self.logger.warning('nvidia-smi is missing, often means no gpu on this machine. '
                                'fall back to cpu!')
    self.logger.info('device map: \n\t\t%s' % '\n\t\t'.join(
        'worker %2d -> %s' % (w_id, ('gpu %2d' % g_id) if g_id >= 0 else 'cpu')
        for w_id, g_id in enumerate(device_map)))
    return device_map
def print_gpu_stat(gpu_id=None):
    """Print GPU status."""
    if gpu_id is None:
        gpu_ids = GPUtil.getAvailable(limit=10)
        for gpu_id in gpu_ids:
            GPU = GPUtil.getGPUs()[gpu_id]
            GPU_load = GPU.load * 100
            GPU_memoryUtil = GPU.memoryUtil / 2.0**10
            GPU_memoryTotal = GPU.memoryTotal / 2.0**10
            GPU_memoryUsed = GPU.memoryUsed / 2.0**10
            GPU_memoryFree = GPU.memoryFree / 2.0**10
            print("Current GPU (ID:{:d}) name:\t{:s}".format(gpu_id, GPU.name))
            print("Total_GPU_memory:\t{:.3f}GB;".format(GPU_memoryTotal))
            print("GPU_memoryUtil:\t{:.3f}GB;".format(GPU_memoryUtil))
            print("GPU_memoryUsed:\t{:.3f}GB;".format(GPU_memoryUsed))
            print("GPU_memoryFree:\t{:.3f}GB;".format(GPU_memoryFree))
            print("GPU_load:\t{:.3f}%;".format(GPU_load))
    else:
        GPU = GPUtil.getGPUs()[gpu_id]
        GPU_load = GPU.load * 100
        GPU_memoryUtil = GPU.memoryUtil / 2.0**10
        GPU_memoryTotal = GPU.memoryTotal / 2.0**10
        GPU_memoryUsed = GPU.memoryUsed / 2.0**10
        GPU_memoryFree = GPU.memoryFree / 2.0**10
        print("Current GPU (ID:{:d}) name:{:s}".format(gpu_id, GPU.name))
        print("Total_GPU_memory: {:.3f}GB;".format(GPU_memoryTotal))
        print("GPU_memoryUsed:{:.3f}GB;".format(GPU_memoryUsed))
        print("GPU_memoryFree:{:.3f}GB;".format(GPU_memoryFree))
        print("GPU_load:{:.3f}%;".format(GPU_load))
def set_free_gpus(num):
    # num: integer; number of GPUs that shall be allocated
    # returns: string listing a total of 'num' available GPUs
    list_gpu = GPUtil.getAvailable(limit=num, maxMemory=0.01)
    print(list_gpu)
    return str(list_gpu)[1:-1]
def autoset_settings(set_var):
    """Autoset GPU parameters using CUDA_VISIBLE_DEVICES variables.

    Return default config if variable not set.
    :param set_var: Variable to set. Must be of type ConfigSettings
    """
    try:
        devices = ast.literal_eval(os.environ["CUDA_VISIBLE_DEVICES"])
        if type(devices) != list and type(devices) != tuple:
            devices = [devices]
        if len(devices) != 0:
            set_var.GPU = len(devices)
            set_var.NJOBS = len(devices)
            warnings.warn("Detecting CUDA devices : {}".format(devices))
    except KeyError:
        set_var.GPU = len(GPUtil.getAvailable(order='first', limit=8, maxLoad=0.5,
                                              maxMemory=0.5, includeNan=False))
        if not set_var.GPU:
            warnings.warn("No GPU automatically detected. Setting SETTINGS.GPU to 0, "
                          "and SETTINGS.NJOBS to cpu_count.")
            set_var.GPU = 0
            set_var.NJOBS = multiprocessing.cpu_count()
        else:
            set_var.NJOBS = set_var.GPU
            warnings.warn("Detecting {} CUDA devices.".format(set_var.GPU))
    return set_var
def get_gpus(self, **kwargs):
    """Gets a list of qualifying GPU IDs."""
    max_load = self._round_to_between_0_and_1(1.0 - self.minFreeLoad)
    max_mem = self._round_to_between_0_and_1(1.0 - self.minFreeMemory)
    log.debug("GPU Requirements")
    table = [
        ("order", self.priority),
        ("maxLoad", max_load),
        ("maxMemory", max_mem),
        ("excludeID", self.ignoreIDs),
        ("excludeUUID", self.ignoreUUIDs),
    ]
    table = tabulate(table, headers=["Parameter", "Value"], tablefmt="simple")
    log.debug(table)
    availableGPUids = GPUtil.getAvailable(
        order=self.priority,
        maxLoad=max_load,
        maxMemory=max_mem,
        excludeID=self.ignoreIDs,
        excludeUUID=self.ignoreUUIDs,
    )
    log.debug("GPU Util Found GPU IDs: " + str(availableGPUids))
    availableGPUids = self._filter_gpus(availableGPUids)
    log.debug("Filtered GPU IDs are: " + str(availableGPUids))
    return availableGPUids
def check_if_gpu():
    gpus = GPUtil.getAvailable()
    if len(gpus) == 0:
        print('No gpu found')
    else:
        print(gpus)
def main(conf_name, gpu):
    # Initialize configs and prepare result dir with date
    if conf_name is None:
        conf = configs.Config()
    else:
        conf = configs.X2_REAL_CONF
        # conf = None
        # exec ('conf = configs.%s' % conf_name)
    res_dir = prepare_result_dir(conf)
    local_dir = os.path.dirname(__file__)

    # We take all png files that are not ground truth
    files = [file_path for file_path in glob.glob('%s/*.png' % conf.input_path)
             if not file_path[-7:-4] == '_gt']

    # Loop over all the files
    for file_ind, input_file in enumerate(files):
        # Ground-truth file needs to be like the input file with _gt (if exists)
        ground_truth_file = input_file[:-4] + '_gt.png'
        if not os.path.isfile(ground_truth_file):
            ground_truth_file = '0'

        # Numeric kernel files need to be like the input file with serial number
        kernel_files = ['%s_%d.mat;' % (input_file[:-4], ind) for ind in range(len(conf.scale_factors))]
        kernel_files_str = ''.join(kernel_files)
        for kernel_file in kernel_files:
            if not os.path.isfile(kernel_file[:-1]):
                kernel_files_str = '0'
                print('no kernel loaded')
                break
        print(kernel_files)

        # This option uses all the gpu resources efficiently
        if gpu == 'all':
            # Stay stuck in this loop until there is some gpu available with at least half capacity
            gpus = []
            while not gpus:
                gpus = GPUtil.getAvailable(order='memory')

            # Take the gpu with the most free memory
            cur_gpu = gpus[-1]

            # Run ZSSR from command line, open xterm for each run
            os.system("xterm -hold -e " + conf.python_path +
                      " %s/run_ZSSR_single_input.py '%s' '%s' '%s' '%s' '%s' '%s' alias python &"
                      % (local_dir, input_file, ground_truth_file, kernel_files_str, cur_gpu, conf_name, res_dir))

            # Verbose
            print('Ran file #%d: %s on GPU %d\n' % (file_ind, input_file, cur_gpu))

            # Wait 5 seconds for the previous process to start using GPU. If we wouldn't wait then GPU memory would
            # not yet be taken and all processes would start on the same GPU at once and later collapse.
            sleep(5)

        # The other option is just to run sequentially on a chosen GPU.
        else:
            run_ZSSR_single_input.main(input_file, ground_truth_file, kernel_files_str, gpu, conf_name, res_dir)
def get_available_gpus(self, limit):
    exclude = self.exclude_gpus + [p['gpu_idx'] for p in self.active_procs]
    return GPUtil.getAvailable(order='random', limit=limit, maxLoad=self.gpu_max_load,
                               maxMemory=self.gpu_max_mem, excludeID=exclude)
def parallel(func: Callable,
             filelist: Iterable,
             use_gpu: bool = False,
             nbprocesses: int = None) -> None:
    """Parallel processing with multiprocessing.Pool(), works better with
    functools.partial().

    If ``use_gpu`` is True, ``gpu_queue`` will be passed to ``func`` as a
    keyword argument. The input ``func`` needs to handle the keyword
    parameter ``gpu_queue`` and select the GPU with gpu_queue.get(). Don't
    forget to put the GPU id back into the gpu_queue at the end of ``func``.

    Parameters
    ----------
    func : `Callable`
        The target function for parallel processing.
    filelist : `Iterable`
        The file list to process with the input function.
    use_gpu : `bool`, optional
        True for running NN-based PCC algs., False otherwise. Defaults to False.
    nbprocesses : `int`, optional
        Specify the number of cpu parallel processes. If None, it will equal
        the cpu count. Defaults to None.

    Raises
    ------
    `ValueError`
        No available GPU.
    """
    if use_gpu is True:
        # Get the number of available GPUs
        deviceIDs = GPUtil.getAvailable(order='first', limit=8, maxLoad=0.5,
                                        maxMemory=0.2, includeNan=False,
                                        excludeID=[], excludeUUID=[])
        process = len(deviceIDs)
        if process <= 0:
            logger.error("No available GPU. Check with the threshold parameters "
                         "of ``GPUtil.getAvailable()``")
            raise ValueError
        manager = Manager()
        gpu_queue = manager.Queue()
        for id in deviceIDs:
            gpu_queue.put(id)
        pfunc = partial(func, gpu_queue=gpu_queue)
    else:
        process = nbprocesses
        pfunc = func

    with Pool(process) as pool:
        list(tqdm(pool.imap_unordered(pfunc, filelist), total=len(filelist)))
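The docstring above prescribes a gpu_queue protocol for the target function. Below is a minimal sketch of a worker that follows it; the name process_file and the body are hypothetical and not taken from the source, only the get/put handling of gpu_queue is the part the docstring requires.

import os

def process_file(path, gpu_queue=None):
    gpu_id = gpu_queue.get()  # claim a free GPU id before doing any work
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        # ... run the GPU-based processing of `path` here ...
    finally:
        gpu_queue.put(gpu_id)  # always return the GPU id so other tasks can use it

# Typical call, pairing the worker with the helper above:
# parallel(process_file, filelist, use_gpu=True)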
def get_dev_one(mem):
    # Relies on n, ok, _limit and logging from the enclosing scope.
    devs = GPUtil.getAvailable(order='memory', maxLoad=1, maxMemory=mem, limit=n)
    devs = _limit(devs, ok)
    if len(devs) >= n:
        logging.info('available {}'.format(devs))
        return devs
    else:
        return []
def set_gpu():
    gpu = GPUtil.getAvailable(limit=3, excludeID=[0, 1])
    vis_gpu = ""
    for g in gpu:
        vis_gpu += ", " + str(g)
    vis_gpu = vis_gpu[1:]
    os.environ["CUDA_VISIBLE_DEVICES"] = vis_gpu
    print("Setting GPUS: ", vis_gpu)
async def get_gpu_info():
    return GPUtil.getAvailable(order='memory', limit=10, maxLoad=0.25, maxMemory=0.25,
                               includeNan=False, excludeID=[], excludeUUID=[])
def main():
    max_devices = 16

    # Check which devices we have locally
    available_devices = GPUtil.getAvailable(limit=max_devices)

    # Use one worker per device
    cluster = LocalCluster(n_workers=len(available_devices), threads_per_worker=4)
    client = Client(cluster)

    # Set up a relatively large regression problem
    n = 100
    m = 10000000
    partition_size = 100000
    X = da.random.random((m, n), partition_size)
    y = da.random.random(m, partition_size)

    xgb.dask.run(client, train, X, y, available_devices)
import os
import sys
import subprocess
import GPUtil

deviceIDs = GPUtil.getAvailable(order='first', limit=3, maxLoad=0.5, maxMemory=0.5)
print(','.join(str(e) for e in deviceIDs))

task_queue_file = os.path.join(os.environ.get("HOME", None), "task_queue.txt")
if not os.path.isfile(task_queue_file):
    sys.exit(0)

task_list = open(task_queue_file).readlines()
print(task_list)

for i in range(min(len(deviceIDs), len(task_list))):
    task = task_list[i].strip() + " --gpu-id=%d" % i
    print(task)
    subprocess.Popen(task, shell=True, cwd="/home/xyang22/project/research/active-learning-dnn")
def available_gpu(*args, **kwargs): """This function is an alias for ``GPUtil.getAvailable``. If ``GPUtil`` is not installed, it returns [0,] as a default GPU ID.""" return GPUtil.getAvailable(*args, **kwargs)