def get_gpu_info():
    """Collect memory and load figures for every GPU reported by GPUtil.

    As a side effect, GPUtil's utilization table is printed to stdout
    before the per-GPU data is gathered.

    :return: a list with one dict per GPU holding its id, total memory,
        used memory, memory utilization times 100 and load times 100.
    """
    # Print the utilization overview first.
    GPUtil.showUtilization()

    # Build one record per detected GPU.
    return [
        {
            "gpu_id": device.id,
            "gpu_memoryTotal": device.memoryTotal,
            "gpu_memoryUsed": device.memoryUsed,
            "gpu_memoryUtil": device.memoryUtil * 100,  # memory usage, percent
            "gpu_load": device.load * 100,  # load, percent
        }
        for device in GPUtil.getGPUs()
    ]
def _create_run_config(self):
    """Build the ``tf.estimator.RunConfig`` and store it in ``self.runConfig``.

    Checkpoint/summary settings are read from ``self.params``. The session
    config enables GPU memory growth. Depending on which handler function in
    ``thread_handler.py`` is on the call stack, the GPU is hidden from
    TensorFlow:

    * ``predict_thread`` -> GPU always disabled;
    * ``run_thread`` -> GPU never disabled here;
    * anything else (including no matching caller) -> GPU disabled only when
      ``GPUtil.getAvailable()`` reports no usable device.
    """
    save_checkpoints_steps = self.params[SAVE_CHECKPOINTS_STEPS]
    save_summary_steps = self.params[SAVE_SUMMARY_STEPS]
    keep_checkpoint_max = self.params[KEEP_CHECKPOINT_MAX]

    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)

    handler_functions = (
        "run_thread",
        "predict_thread",
        "explain_thread",
        "predict_test_thread",
    )
    # Names of the handler functions currently on the stack, innermost first.
    callers = [
        frame.function
        for frame in inspect.stack()
        if ntpath.basename(frame.filename) == "thread_handler.py"
        and frame.function in handler_functions
    ]
    # Use the outermost matching frame; None when called from elsewhere
    # (previously this raised IndexError).
    caller = callers[-1] if callers else None

    # The old code compared against "predict"/"run", which can never appear
    # in ``callers`` because only the ``*_thread`` names pass the filter.
    # NOTE(review): confirm that forcing CPU for predict_thread is intended.
    if caller == "predict_thread" or (
        caller != "run_thread" and not GPUtil.getAvailable()
    ):
        config.device_count.update({"GPU": 0})

    self.runConfig = tf.estimator.RunConfig(
        model_dir=self.checkpoint_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        save_summary_steps=save_summary_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        session_config=config,
    )
def get_device(cpu_force=False):
    """Pick the torch device to run on.

    :param cpu_force: when true, return the CPU device without checking CUDA.
    :return: ``torch.device('cpu')`` if the CPU is forced, CUDA is not
        available, or GPUtil finds no GPU under the 0.9 memory/load limits;
        otherwise the ``cuda:<id>`` device for the first id GPUtil returns
        (ordered by memory).
    """
    fallback = torch.device('cpu')

    if cpu_force:
        print('Using force cpu')
        return fallback
    if not torch.cuda.is_available():
        print('There is no available gpus')
        return fallback

    candidates = GPUtil.getAvailable(order='memory', maxMemory=0.9, maxLoad=0.9)
    if not candidates:
        return fallback
    return torch.device(f'cuda:{candidates[0]}')
def single_exp_runner(task_fn, auto_choose_gpu_flag=False, gpu_id: int = 0, seed=None,
                      del_if_log_path_existed=False, keep_session=False, **task_fn_kwargs):
    """
    Run a single experiment: select the GPU, seed, set up logging, call ``task_fn``.

    :param task_fn: task function defined by the user
    :type task_fn: method
    :param auto_choose_gpu_flag: auto choose gpu, default False
    :type auto_choose_gpu_flag: bool
    :param gpu_id: gpu id used when ``auto_choose_gpu_flag`` is not True, default 0
    :type gpu_id: int
    :param seed: global seed; when None, one is derived from the system time
    :type seed: int
    :param del_if_log_path_existed: delete obsolete log file path if existed, by default False
    :type del_if_log_path_existed: bool
    :param keep_session: whether to keep default session & graph (passed to ``_reset_global_seed``)
    :type keep_session: bool
    :param task_fn_kwargs: keyword arguments forwarded to ``task_fn``
    :type task_fn_kwargs: dict
    :return: None
    :rtype: None
    """
    os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
    if auto_choose_gpu_flag is True:
        # getFirstAvailable returns a one-element list of device ids.
        device_id = Gpu.getFirstAvailable()[0]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # Only fall back to a time-based seed when no seed was supplied; the old
    # ``if not seed`` check also discarded an explicit ``seed=0``.
    if seed is None:
        seed = int(round(time.time() * 1000)) % (2 ** 32 - 1)
    _reset_global_seed(seed, keep_session)

    global_config = GlobalConfig()
    log_path = global_config.DEFAULT_LOG_PATH
    print("create log path at {}".format(log_path), flush=True)
    file.create_path(path=log_path, del_if_existed=del_if_log_path_existed)

    Logger().init(config_or_config_dict=dict(),
                  log_path=log_path,
                  log_level=global_config.DEFAULT_LOG_LEVEL)
    ConsoleLogger().init(
        to_file_flag=global_config.DEFAULT_WRITE_CONSOLE_LOG_TO_FILE_FLAG,
        to_file_name=os.path.join(log_path, global_config.DEFAULT_CONSOLE_LOG_FILE_NAME),
        level=global_config.DEFAULT_LOG_LEVEL,
        logger_name=global_config.DEFAULT_CONSOLE_LOGGER_NAME)

    task_fn(**task_fn_kwargs)
def startup_fn():
    """Detect the GPU to use and record its VRAM size in the configuration.

    GPUs are read through GPUtil, optionally filtered by
    ``CUDA_VISIBLE_DEVICES``, and only those with memory utilization below
    50% count as available. Exactly one available GPU is required.

    Setting ``CHIA_CPU_ONLY=1`` hides all GPUs via ``CUDA_VISIBLE_DEVICES``
    and raises a ``ValueError`` that is caught by the handler below, so the
    default VRAM value is used.

    :return: ``False`` when zero or more than one GPU is available;
        ``True`` otherwise, including when any exception forced the
        4 GiB default.
    """
    try:
        from GPUtil import GPUtil
        import math
        import os

        # Disable GPUs if desired.
        if os.environ.get("CHIA_CPU_ONLY") == "1":
            print("Requested CPU only operation."
                  "Disabling all GPUS via environment variable.")
            os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
            raise ValueError("Requested CPU-only operation.")

        gpus = GPUtil.getGPUs()
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            cuda_filter = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
            print(f"Found CUDA device filter: {cuda_filter}")
            gpus = [gpu for gpu in gpus if str(gpu.id) in cuda_filter]

        available_gpus = []
        for gpu in gpus:
            print(f"Found GPU: {gpu.name}")
            if gpu.memoryUtil < 0.5:
                # Add only this GPU. The old ``+= gpus`` added the whole
                # list for every usable GPU, so multi-GPU hosts were
                # miscounted and index 0 could be a busy GPU.
                available_gpus.append(gpu)
            else:
                print("Cannot use this GPU because of its "
                      f"memory utilization @ {int(100.0 * gpu.memoryUtil)}%.")

        if len(available_gpus) > 1:
            print("Only one GPU is supported right now.")
            return False
        if len(available_gpus) < 1:
            print("Need an available GPU!")
            return False

        gpu = available_gpus[0]
        # Divide by 1024 and truncate to one decimal place.
        configuration.set_system("gpu0_vram",
                                 math.trunc(gpu.memoryTotal / 102.4) / 10)

    except Exception as ex:
        print(
            f"Could not read available VRAM: {str(ex)}. Setting default value of 4GiB."
        )
        configuration.set_system("gpu0_vram", 4.0)

    print(f"GPU0: {configuration.get_system('gpu0_vram')} Gib VRAM")
    return True
def get_empty_gpu() -> int:
    """Return the id of an idle GPU, retrying while none is free.

    A GPU qualifies when GPUtil reports it under 5% load and 5% memory use;
    one is picked at random among the candidates. Up to 11 attempts are
    made, 10 seconds apart.

    :return: id of the selected GPU.
    :raises RuntimeError: if no GPU became available within the retries.
    """
    max_retries = 10
    retry_delay_seconds = 10

    last_error = None
    for attempt in range(max_retries + 1):
        if attempt:
            # Wait before every retry, but not before the first attempt
            # and not after the last one.
            time.sleep(retry_delay_seconds)
        try:
            return GPUtil.getFirstAvailable(order='random',
                                            maxLoad=0.05,
                                            maxMemory=0.05)[0]
        except Exception as err:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit and made the wait loop uninterruptible.
            last_error = err

    raise RuntimeError("No available GPU to allocate") from last_error
def _create_run_config(self):
    """Build the ``tf.estimator.RunConfig`` and store it in ``self.runConfig``.

    Checkpoint/summary settings are read from ``self.params``. The session
    config enables GPU memory growth. The GPU is hidden from TensorFlow
    depending on the outermost ``dfweb.py`` function on the call stack
    (``check_session`` is ignored):

    * ``predict`` -> GPU always disabled;
    * ``run`` -> GPU never disabled here;
    * anything else (including no ``dfweb.py`` caller) -> GPU disabled only
      when ``GPUtil.getAvailable()`` reports no usable device.
    """
    # Local import keeps this block self-contained; ntpath.basename accepts
    # both '/' and '\\' separators, unlike the previous split('/').
    import ntpath

    save_checkpoints_steps = self.params[SAVE_CHECKPOINTS_STEPS]
    save_summary_steps = self.params[SAVE_SUMMARY_STEPS]
    keep_checkpoint_max = self.params[KEEP_CHECKPOINT_MAX]

    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)

    # Functions from dfweb.py currently on the stack, innermost first.
    callers = [
        frame.function
        for frame in inspect.stack()
        if ntpath.basename(frame.filename) == 'dfweb.py'
        and frame.function != 'check_session'
    ]
    # Outermost matching frame; None when not called through dfweb.py
    # (previously this raised IndexError).
    caller = callers[-1] if callers else None

    if caller == 'predict' or (caller != 'run' and not GPUtil.getAvailable()):
        config.device_count.update({'GPU': 0})

    self.runConfig = tf.estimator.RunConfig(
        model_dir=self.checkpoint_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        save_summary_steps=save_summary_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        session_config=config)
def get_random_gpu():
    """Return a CUDA device for a GPU that GPUtil considers available.

    Candidates are GPUs under the 0.9 memory and 0.9 load limits, ordered
    by memory; the first id in that list is used.

    :return: ``torch.device`` for the chosen ``cuda:<id>``.
    :raises RuntimeError: if GPUtil returns no candidate.
    """
    candidates = GPUtil.getAvailable(order='memory', maxMemory=0.9, maxLoad=0.9)
    if not candidates:
        raise RuntimeError('No gpu found')

    chosen_id = candidates[0]
    return torch.device(f'cuda:{chosen_id}')
def update(self):
    """Refresh the processor, disk, RAM and GPU sections of ``self.config``.

    Disk figures come from the first partition psutil lists, GPU figures
    from the first GPU GPUtil lists. OS-specific processor details are
    merged in afterwards for Linux and Windows.
    """
    ram_info = psutil.virtual_memory()
    disk_info = psutil.disk_partitions()
    disk_memory_info = psutil.disk_usage(disk_info[0].mountpoint)
    # NOTE(review): raises IndexError on hosts where GPUtil reports no GPU.
    gpu_info = GPUtil.getGPUs()[0]
    cpu_freq = psutil.cpu_freq()

    self.config.update({
        'processor': {
            'architecture': platform.processor(),
            'total_cores': psutil.cpu_count(logical=True),
            'max_frequency': f'{cpu_freq.max}Mhz',
            'current_frequency': f'{cpu_freq.current:.2f}Mhz',
            'loading': f'{psutil.cpu_percent()}%',
            'usage_per_core': self.get_cpu_per_core(),
        },
        'disk': {
            'file_system_type': disk_info[0].fstype,
            'total_memory': self.get_size(disk_memory_info.total),
            'available': self.get_size(disk_memory_info.free),
            'used': self.get_size(disk_memory_info.used),
            'used_in_percents': f'{disk_memory_info.percent}%',
        },
        'ram': {
            'total_memory': self.get_size(ram_info.total),
            'available': self.get_size(ram_info.available),
            'used': self.get_size(ram_info.used),
            'used_in_percents': f'{ram_info.percent}%',
        },
        'gpu': {
            'name': gpu_info.name,
            'temperature': f'{gpu_info.temperature}°C',
            # GPUtil reports load as a 0..1 fraction; scale it so the value
            # matches the '%' suffix (it was printed unscaled before).
            'loading': f'{gpu_info.load * 100:.2f}%',
            'total_memory': f'{gpu_info.memoryTotal}MB',
            'available': f'{gpu_info.memoryFree}MB',
            'used': f'{gpu_info.memoryUsed}MB',
            'used_in_percents': f'{gpu_info.memoryUsed / gpu_info.memoryTotal * 100:.2f}%'
        }
    })

    if self.system_name == OSName.LINUX.value:
        # NOTE(review): raises KeyError on hosts without a "k10temp" sensor.
        self.config['processor'].update({
            'name': Computer().processor.name,
            'architecture': platform.processor(),
            'temperature': f'{psutil.sensors_temperatures()["k10temp"][0].current}°C'
        })
    if self.system_name == OSName.WINDOWS.value:
        self.config['processor'].update({
            'name': self.get_processor_name(),
            'architecture': str(platform.architecture()[0]),
            'temperature': 'Unavailable for OS Windows'
        })
def __init__(self, token: str):
    """Take an initial snapshot of host information into ``self.config``.

    The snapshot covers OS, processor, network identity, disk, RAM and GPU.
    Disk figures come from the first partition psutil lists, GPU figures
    from the first GPU GPUtil lists. OS-specific processor details are
    merged in afterwards for Linux and Windows.

    :param token: stored unchanged under the ``'token'`` key.
    """
    os_info = platform.uname()
    self.system_name = os_info.system

    ram_info = psutil.virtual_memory()
    disk_info = psutil.disk_partitions()
    disk_memory_info = psutil.disk_usage(disk_info[0].mountpoint)
    # NOTE(review): raises IndexError on hosts where GPUtil reports no GPU.
    gpu_info = GPUtil.getGPUs()[0]
    cpu_freq = psutil.cpu_freq()
    hostname = socket.gethostname()

    self.config = {
        'token': token,
        'os': {
            'name': self.system_name,
            'version': os_info.version,
        },
        'processor': {
            'total_cores': psutil.cpu_count(logical=True),
            'max_frequency': f'{cpu_freq.max}Mhz',
            'current_frequency': f'{cpu_freq.current:.2f}Mhz',
            'loading': f'{psutil.cpu_percent()}%',
            'usage_per_core': self.get_cpu_per_core(),
        },
        'socket_info': {
            'host': hostname,
            'ip_address': socket.gethostbyname(hostname),
            # uuid.getnode() as 12 hex digits, split into colon-separated pairs.
            'mac_address': ':'.join(re.findall('..', '%012x' % uuid.getnode())),
        },
        'disk': {
            'file_system_type': disk_info[0].fstype,
            'total_memory': self.get_size(disk_memory_info.total),
            'available': self.get_size(disk_memory_info.free),
            'used': self.get_size(disk_memory_info.used),
            'used_in_percents': f'{disk_memory_info.percent}%',
        },
        'ram': {
            'total_memory': self.get_size(ram_info.total),
            'available': self.get_size(ram_info.available),
            'used': self.get_size(ram_info.used),
            'used_in_percents': f'{ram_info.percent}%',
        },
        'gpu': {
            'name': gpu_info.name,
            'temperature': f'{gpu_info.temperature}°C',
            # GPUtil reports load as a 0..1 fraction; scale it so the value
            # matches the '%' suffix (it was printed unscaled before).
            'loading': f'{gpu_info.load * 100:.2f}%',
            'total_memory': f'{gpu_info.memoryTotal}MB',
            'available': f'{gpu_info.memoryFree}MB',
            'used': f'{gpu_info.memoryUsed}MB',
            'used_in_percents': f'{gpu_info.memoryUsed / gpu_info.memoryTotal * 100:.2f}%'
        }
    }

    if self.system_name == OSName.LINUX.value:
        # NOTE(review): raises KeyError on hosts without a "k10temp" sensor.
        self.config['processor'].update({
            'name': Computer().processor.name,
            'architecture': platform.processor(),
            'temperature': f'{psutil.sensors_temperatures()["k10temp"][0].current}°C'
        })
    if self.system_name == OSName.WINDOWS.value:
        self.config['processor'].update({
            'name': self.get_processor_name(),
            'architecture': platform.architecture()[0],
            'temperature': 'Unavailable for OS Windows'
        })
# Load the datasets. Both loaders take the same arguments, so only the
# loader function depends on the project name.
load_data = load_stackex_data if PROJECT.startswith("stackex") else load_br_data
reader, train_ds, val_ds, test_ds = load_data(
    DATA_ROOT, USER_ID_FILE, args.encoder, args.data_type,
    args.max_seq_len, args.toy_data)
num_authors = reader.get_author_num()
print("User Number: ", str(num_authors))

# choose available gpu
if args.device < 0:
    try:
        avail_gpus = GPUtil.getFirstAvailable(maxMemory=0.1, maxLoad=0.05)
        # avail_gpus = GPUtil.getFirstAvailable(maxMemory=0.9, maxLoad=0.9)
    except RuntimeError:
        # getFirstAvailable raises instead of returning an empty list when
        # no GPU qualifies; treat that as "nothing available" so the message
        # below is printed rather than a traceback.
        avail_gpus = []
    if len(avail_gpus) == 0:
        print("No GPU available!!!")
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
    args.device = avail_gpus[0]
print("### Use GPU", str(args.device), "###")
torch.cuda.set_device(args.device)
# val_size = int(len(train_ds) * args.val_ratio)
# val_ds = train_ds[-val_size:]
# train_ds = train_ds[:-val_size]
# if args.test_ratio != 1:
# print("Testing ratio:", args.test_ratio)
# test_size = int(len(test_ds) * args.test_ratio)