def test_new_query_mocked(self, N, Process):
    """
    A basic functionality test, in a case where everything is just normal.

    After `_configure_mock(N, Process)` has prepared the injected mocks, a
    query is rendered with the user, command and PID columns enabled. The
    output is passed through `remove_ansi_codes`, its first line (the header)
    is dropped, and the remainder is compared with a literal expected block.
    """
    _configure_mock(N, Process)

    gpustats = gpustat.new_query()
    fp = StringIO()
    gpustats.print_formatted(fp=fp, no_color=False, show_user=True,
                             show_cmd=True, show_pid=True)

    result = fp.getvalue()
    print(result)  # echo the raw output so it shows up in the test log

    unescaped = remove_ansi_codes(result)
    # remove first line (header)
    unescaped = '\n'.join(unescaped.split('\n')[1:])

    # NOTE(review): the comparison below is whitespace-exact; confirm the
    # column padding inside this literal against the real formatter output.
    expected = """\
[0] GeForce GTX TITAN 0 | 80'C, 76 % | 8000 / 12287 MB | user1:python/48448(4000M) user2:python/153223(4000M)
[1] GeForce GTX TITAN 1 | 36'C, 0 % | 9000 / 12189 MB | user1:torch/192453(3000M) user3:caffe/194826(6000M)
[2] GeForce GTX TITAN 2 | 71'C, ?? % | 0 / 12189 MB | (Not Supported)
"""
    # Raise the diff size limit so a mismatch is reported in full.
    self.maxDiff = 4096
    self.assertEqual(unescaped, expected)
def test_new_query_mocked(self, scenario_basic):
    """Compare the fully-featured formatted output with the stored reference.

    Every optional column is switched on. Once ANSI codes and the header
    line are removed, the text must equal
    ``MOCK_EXPECTED_OUTPUT_FULL_PROCESS``.
    """
    format_options = dict(
        no_color=False,
        show_user=True,
        show_cmd=True,
        show_full_cmd=True,
        show_pid=True,
        show_fan_speed=True,
        show_codec="enc,dec",
        show_power=True,
    )

    query = gpustat.new_query()
    buffer = StringIO()
    query.print_formatted(fp=buffer, **format_options)

    rendered = buffer.getvalue()
    print(rendered)

    # Everything after the first line (the header) is what gets compared.
    body_lines = remove_ansi_codes(rendered).splitlines()[1:]
    unescaped = os.linesep.join(body_lines)

    assert unescaped == MOCK_EXPECTED_OUTPUT_FULL_PROCESS
def main(args):
    """Pick a GPU if none was chosen, then run evaluation or training.

    When ``CUDA_VISIBLE_DEVICES`` is unset and more than two GPUs are
    reported, the GPU with the smallest ``memory.used`` value is exported
    through ``CUDA_VISIBLE_DEVICES``. If even that GPU reports more than
    2000, two messages are printed and the process exits with status -2.

    :param args: parsed arguments; reads ``sub``, ``config_file``,
        ``pretrained_model``, ``logdir`` and ``no_logs``
    :return: the result of ``train(...)``; ``None`` when ``args.sub == "eval"``
    """
    global orig_stdout

    # sanity check: do not silently grab a GPU somebody else is using
    if "CUDA_VISIBLE_DEVICES" not in os.environ:
        gpus = gpustat.new_query().jsonify()['gpus']
        if len(gpus) > 2:
            # min() keeps the first entry on ties, like a strict '<' scan
            least_used = min(gpus, key=lambda gpu: gpu['memory.used'])
            if least_used['memory.used'] > 2000:
                print(
                    "No GPU is available for now, try again later or leave a message after the tone *beep*"
                )
                print(
                    "If you want to run on CPU, set CUDA_VISIBLE_DEVICES to the right value"
                )
                exit(-2)
            os.environ["CUDA_VISIBLE_DEVICES"] = str(least_used['index'])

    with std_out_err_redirect_tqdm() as orig_stdout:
        if args.sub != "eval":
            return train(config_file=args.config_file,
                         create_logs=not args.no_logs,
                         act_time=args.logdir,
                         args=args)
        eval_network(args.config_file,
                     args.pretrained_model,
                     act_time=args.logdir,
                     create_logs=not args.no_logs,
                     args=args)
def getNvidiaInfo(_=None):
    """Return per-GPU name, memory and utilisation as parallel lists.

    :param _: ignored
    :return: dict with keys ``names``, ``memused``, ``memtotal`` and
        ``loads``; each value holds one entry per GPU of the query
    """
    gpus = gpustat.new_query().gpus
    return {
        "names": [gpu.name for gpu in gpus],
        "memused": [gpu.memory_used for gpu in gpus],
        "memtotal": [gpu.memory_total for gpu in gpus],
        "loads": [gpu.utilization for gpu in gpus],
    }
def test_new_query_mocked(self, N, Process, virtual_memory):
    """Render the mocked query with the process columns and check the text.

    The output (ANSI codes removed, header line dropped) has to match
    ``MOCK_EXPECTED_OUTPUT_FULL_PROCESS``.
    """
    _configure_mock(N, Process, virtual_memory)

    query = gpustat.new_query()
    buffer = StringIO()
    query.print_formatted(
        fp=buffer,
        no_color=False,
        show_user=True,
        show_cmd=True,
        show_pid=True,
        show_power=True,
        show_fan_speed=True,
        show_full_cmd=True,
    )

    rendered = buffer.getvalue()
    print(rendered)

    # drop the header, keep the per-GPU lines
    per_gpu_lines = remove_ansi_codes(rendered).splitlines()[1:]
    unescaped = os.linesep.join(per_gpu_lines)

    self.maxDiff = 4096
    self.assertEqual(unescaped, MOCK_EXPECTED_OUTPUT_FULL_PROCESS)
def main():
    """Run a command on the least-used GPU(s) that have no processes.

    Usage: ``prog [-n N_GPUS] command [args...]``

    GPUs without any process are ordered by used memory; the first
    ``N_GPUS`` (default 1) are exported through ``CUDA_VISIBLE_DEVICES``
    and the current process is replaced by ``command``.

    Exits with status 2 when no command is given and with status 1 when
    fewer than ``N_GPUS`` free GPUs are found.
    """
    cmd = sys.argv[1:]
    n_gpus = 1
    if len(cmd) >= 2 and cmd[0] == '-n':
        n_gpus = int(cmd[1])
        cmd = cmd[2:]

    # Without this guard an empty command line would only fail later with
    # an IndexError on cmd[0], after the GPUs had already been queried.
    if not cmd:
        print("usage: {} [-n N_GPUS] command [args...]".format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(2)

    def is_free(gpu):
        return len(gpu.processes) == 0

    gpus = new_query()
    memory_used = sorted(
        (gpu.memory_used, gpu.index) for gpu in gpus if is_free(gpu))
    if len(memory_used) < n_gpus:
        print("sorry, there are not enough free gpus right now :(")
        sys.exit(1)

    indices = [str(idx) for _, idx in memory_used[:n_gpus]]

    # Hand the child a copy of the environment instead of mutating ours.
    env = dict(os.environ)
    env['CUDA_VISIBLE_DEVICES'] = ','.join(indices)
    os.execvpe(cmd[0], cmd, env)
def test_new_query_mocked_nonexistent_pid(self, N, Process):
    """
    Smoke test for the case where the nvidia query reports pids that do not
    exist (see #16, #18): formatting the query must not raise.
    """
    _configure_mock(N, Process, scenario_nonexistent_pid=True)
    gpustat.new_query().print_formatted(fp=sys.stdout)
def taLibs_getBatch_ViaGPU_Size(B):
    """Look up the entry of *B* that matches the first GPU's memory size.

    ``memory_total`` of GPU 0 is floor-divided by 1000 and the keys of *B*
    are scanned in their stored order; the value of the first key that is
    not smaller than that figure is returned.

    :param B: mapping from a size threshold to the value to return
    :raises IndexError: if every key is smaller than the GPU figure
        (or *B* is empty)
    """
    first_gpu = gpustat.new_query().gpus[0]
    gpu_size = first_gpu.memory_total // 1000

    thresholds = list(B.keys())
    position = 0
    # NOTE(review): relies on the keys being stored in ascending order.
    while thresholds[position] < gpu_size:
        position += 1
    return B[thresholds[position]]
def get_process_gpu_memory(gpu_id=0):
    """Return the GPU memory usage reported for the current process.

    :param gpu_id: position of the GPU inside the query result
    :return: the ``gpu_memory_usage`` value of this process on that GPU, or
        0 unless exactly one process entry carries this process' pid
    """
    gpu = gpustat.new_query().gpus[gpu_id]
    own_pid = os.getpid()

    own_entries = []
    for proc in gpu.processes:
        if proc['pid'] == own_pid:
            own_entries.append(proc)

    if len(own_entries) != 1:
        return 0
    return own_entries[0]['gpu_memory_usage']
def show_gpu_chooser(default=0, override=None):
    """Return a device string, asking the user interactively if needed.

    :param default: GPU index used when the interactive answer is not a
        usable choice
    :param override: optional string that bypasses the prompt. A decimal
        string naming an existing CUDA device becomes ``"cuda:N"``; any
        other value is returned unchanged.
    :return: ``"cpu"``, ``"cuda:N"`` or the *override* string itself
    """

    def _cuda_device(text):
        """Return ``"cuda:N"`` when *text* is a valid device index, else None."""
        if not text.isdecimal():
            return None
        idx = int(text)
        if idx in range(cuda.device_count()):
            return "cuda:{}".format(idx)
        return None

    if override is not None:
        # Only consult CUDA for numeric overrides; everything else is
        # passed through untouched.
        if override.isdecimal() and cuda.is_available():
            device = _cuda_device(override)
            if device is not None:
                return device
        return override

    if not cuda.is_available():
        return "cpu"

    gpustat.new_query().print_formatted(no_color=True)
    # Strip the answer so "1 " or " cpu" are not silently replaced by the
    # default device.
    choice = input("Choose GPU (default {}):".format(default)).strip()
    if choice == "cpu":
        return "cpu"
    return _cuda_device(choice) or "cuda:{}".format(default)
def get_gpu_info():
    """Report whether GPUs are visible and list their names.

    :return: ``{"have_gpu": bool, "gpus": [name, ...]}``; when the query
        fails for any reason the result is
        ``{"have_gpu": False, "gpus": []}``
    """
    # Note this only works for nvidia gpus atm
    try:
        stats = gpustat.new_query()
    except Exception:
        # Best effort: a failing query is reported as "no GPU". Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return {"have_gpu": False, "gpus": []}

    names = [gpu["name"] for gpu in stats.gpus]
    return {
        "have_gpu": bool(names),
        "gpus": names,
    }
def test_json_mocked(self, N, Process):
    """The JSON dump of a mocked query must be parseable by ``json.loads``."""
    import json

    _configure_mock(N, Process)

    buffer = StringIO()
    gpustat.new_query().print_json(fp=buffer)

    parsed = json.loads(buffer.getvalue())
    print(parsed)
def test_json_mocked(self, N, Process, virtual_memory):
    """The JSON dump of a mocked query must be parseable by ``json.loads``."""
    import json

    _configure_mock(N, Process, virtual_memory)

    stream = StringIO()
    gpustat.new_query().print_json(fp=stream)

    decoded = json.loads(stream.getvalue())
    print(decoded)
def auto_select_gpu(assigned_gpu_id=None, ngpus=1):
    """
    Choose a main GPU plus ``ngpus`` GPUs in total.

    assigned_gpu_id is the main deviceId
    ngpus denote the total number of gpus

    Candidate devices come from ``CUDA_VISIBLE_DEVICES`` when it is set,
    otherwise from the ``index`` field of every queried GPU. Each candidate
    is scored as ``sqrt(memRestRate * computeRestRate)`` using the pairs
    returned by ``get_gpu_compute_rest`` and put into one of three levels
    (both rates > 0.5, both > 0.3, everything else).

    :return: ``(best, gpu_name, valid_gpus)`` where ``best`` and the entries
        of ``valid_gpus`` are positions inside the candidate list
    :raises ValueError: if fewer than ``ngpus`` candidates exist
    """
    gpu_id_list = os.getenv('CUDA_VISIBLE_DEVICES')
    gpu_stats_list = gpustat.new_query()
    if gpu_id_list is None:
        gpu_id_list = [g['index'] for g in gpu_stats_list]
    else:
        gpu_id_list = [int(value.strip()) for value in gpu_id_list.split(',')]

    device_compute_rest = get_gpu_compute_rest(gpu_stats_list, gpu_id_list)
    device_first_level, device_second_level, device_third_level = {}, {}, {}
    for i in device_compute_rest:
        computeRestRate, memRestRate = device_compute_rest[i]
        score = math.sqrt(memRestRate * computeRestRate)
        if memRestRate > 0.5 and computeRestRate > 0.5:
            device_first_level[i] = score
        elif memRestRate > 0.3 and computeRestRate > 0.3:
            device_second_level[i] = score
        else:
            device_third_level[i] = score

    # -1 means "no usable manual choice". The test is "is not None" rather
    # than truthiness so that an explicit GPU 0 is honoured.
    best = -1
    if assigned_gpu_id is not None:
        best = assigned_gpu_id
        if best not in gpu_id_list:
            best = -1
            print(
                "WARNING: Manually selected main gpu index is out of range! We will use autoselected gpu!"
            )

    def sort_candidate_gpus(devices):
        """Return the device ids of *devices* ordered by ascending score."""
        # NOTE(review): ascending order puts the lowest score first inside
        # a level -- confirm this is intended and not a missing reverse=True.
        return [
            device
            for device, _ in sorted(devices.items(), key=lambda x: x[1])
        ]

    valid_gpus = (sort_candidate_gpus(device_first_level) +
                  sort_candidate_gpus(device_second_level) +
                  sort_candidate_gpus(device_third_level))
    if len(valid_gpus) < ngpus:
        raise ValueError('[Error]: not enough available gpus')
    valid_gpus = valid_gpus[:ngpus]

    if best == -1:
        best = valid_gpus[0]
    if best not in valid_gpus:
        # make room for the manually chosen main GPU
        valid_gpus[-1] = best

    # Look the name up while `best` is still a device id; afterwards it is
    # a position inside gpu_id_list, which differs from the device id
    # whenever CUDA_VISIBLE_DEVICES is not the identity mapping.
    gpu_name = gpu_stats_list[best]['name']
    best = gpu_id_list.index(best)
    valid_gpus = [gpu_id_list.index(each) for each in valid_gpus]
    return best, gpu_name, valid_gpus
def get_available_gpu_memory_list():
    """Return ``memory_available`` for every visible GPU.

    When ``CUDA_VISIBLE_DEVICES`` is set, only GPUs whose position in the
    query result is listed there are reported, in query order (not in the
    order given by the variable). Empty entries are ignored, so an empty
    value yields an empty list instead of a ``ValueError``.

    :raises ValueError: if an entry of ``CUDA_VISIBLE_DEVICES`` is not an
        integer
    """
    visible_spec = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible_spec is None:
        return [gpu.memory_available for gpu in gpustat.new_query().gpus]

    visible_devices = {
        int(token) for token in visible_spec.split(',') if token.strip()
    }
    if not visible_devices:
        # Nothing is visible, so there is nothing to query.
        return []

    return [
        gpu.memory_available
        for i, gpu in enumerate(gpustat.new_query().gpus)
        if i in visible_devices
    ]
def get_free_gpus(bytes_needed=0):
    """List CUDA device indices with more than *bytes_needed* free memory.

    Free memory is ``memory.total - memory.used`` scaled by 2**20.

    :param bytes_needed: strict lower bound on the free bytes of a device
    :return: device indices ordered from most to least free memory
    """
    gpu_stats = gpustat.new_query()

    candidates = []
    for device in range(cuda.device_count()):
        entry = gpu_stats[device]
        bytes_free = 2**20 * (entry["memory.total"] - entry["memory.used"])
        if bytes_free > bytes_needed:
            candidates.append((device, bytes_free))

    # stable sort: devices with equal free memory keep ascending index order
    candidates.sort(key=lambda item: item[1], reverse=True)
    return [device for device, _ in candidates]
def main():
    """Return the GPU query as JSON, extended with a ``cpu.usage`` entry.

    The CPU figure is ``100 - x`` where ``x`` is the number parsed from the
    fourth comma-separated field of the second ``%Cpu`` line printed by
    ``top -b -n 2``.
    """
    stats = gpustat.new_query()

    top_pipeline = "top -b -n 2 | grep %Cpu | sed -n 2p | awk -F',' '{{print $4}}'"
    _, top_field = commands.getstatusoutput(top_pipeline)
    # NOTE(review): presumably the idle percentage -- depends on top's layout
    measured = float(top_field.split(' ')[1])

    payload = stats.jsonify()
    payload["cpu.usage"] = "{}%".format(100 - measured)
    return jsonify(payload)
def test_json_mocked(self, scenario_basic):
    """The JSON dump of the query must be parseable by ``json.loads``."""
    import json
    from pprint import pprint

    stream = StringIO()
    gpustat.new_query().print_json(fp=stream)

    decoded = json.loads(stream.getvalue())
    pprint(decoded)
def get_process_memory():
    """
    Return ``(used_ram, used_gpu)`` for the current process.

    ``used_ram`` is the resident set size multiplied by 1e-6 (megabytes).
    ``used_gpu`` is the ``gpu_memory_usage`` value of the last process
    entry, across all GPUs, whose pid equals this process' pid, or 0 if
    none matches.
    """
    own_pid = os.getpid()

    used_gpu = 0
    for gpu in gpustat.new_query().gpus:
        for proc in gpu.processes:
            if proc["pid"] != own_pid:
                continue
            used_gpu = proc["gpu_memory_usage"]

    rss_bytes = psutil.Process(own_pid).memory_info().rss
    used_ram = rss_bytes * 1e-6
    return used_ram, used_gpu
def gpu_info():
    """Return the GPU query result as a list of dicts, cached for 5 seconds.

    The module-level ``gpu_info_data`` holds the cached list and
    ``gpu_info_expires`` the time after which it is refreshed.

    :return: one ``dict`` per GPU, or ``[]`` if anything goes wrong while
        refreshing
    """
    global gpu_info_data, gpu_info_expires
    try:
        if gpu_info_expires < time.time():
            gpu_info_data = None  # cache is stale, force a re-query
        if gpu_info_data is None:
            query_result = gpustat.new_query()
            gpu_info_data = [dict(gpu) for gpu in query_result]
            gpu_info_expires = time.time() + 5
    except Exception:
        # Best effort: report "no GPUs" on failure. Catching Exception
        # (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate. The expiry is left untouched, so when the entry is
        # already stale the next call tries again.
        gpu_info_data = []
    return gpu_info_data
def get_hardware():
    """
    Return some hardware info...

    Requirements::

        pip install -U gpustat
        pip install -U py-cpuinfo
        import gpustat
        from cpuinfo import get_cpu_info

    :return: dict with two views of the same data: ``"full"`` (complete CPU
        info, every GPU entry, ``str()`` of the psutil virtual-memory
        record) and ``"short"`` (selected CPU fields, GPU name/uuid only,
        total memory). Both GPU parts are ``{}`` when the GPU query fails.
    """
    cpus = get_cpu_info()

    cuda_var = "CUDA_VISIBLE_DEVICES"
    try:
        gpus = {
            "used": os.environ.get(cuda_var, "all"),
            "devices":
            {str(i): v.entry
             for i, v in enumerate(gpustat.new_query())},
        }
        gpus_ = {
            "used": gpus["used"],
            "devices": {
                gpuid: {key: gpu[key] for key in ("name", "uuid")}
                for gpuid, gpu in gpus["devices"].items()
            },
        }
    except Exception:
        # Best effort: a machine without a usable GPU query still gets CPU
        # and memory info. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        gpus = {}
        gpus_ = {}

    # The "vram" keys hold psutil.virtual_memory(), i.e. system memory.
    vram = psutil.virtual_memory()

    # shorter version for printing
    cpus_ = {
        key: cpus[key]
        for key in (
            "brand_raw",
            "count",
            "hz_advertised_friendly",
            "hz_actual_friendly",
        )
    }

    return {
        "full": {
            "cpu": cpus,
            "gpu": gpus,
            "vram": str(vram)
        },
        "short": {
            "cpu": cpus_,
            "gpu": gpus_,
            "vram": vram.total
        },
    }
def get_first_available_gpu():
    """
    Find the first GPU that looks unused.

    A GPU qualifies when its ``memory_used`` is below 700 and
    ``tmux has-session -t GPU<id>`` returns a non-zero status.

    :return: gpu_id of the first qualifying GPU, or -1 if there is none
    """
    query = gpustat.new_query()
    for gpu_id in range(len(query)):
        if query[gpu_id].memory_used >= 700:
            continue
        status = os.system(f"tmux has-session -t GPU{gpu_id} 2>/dev/null")
        if int(status) != 0:
            return gpu_id
    return -1
def _machine_stats():
    """
    Collect a snapshot of CPU, memory, disk, network, I/O and GPU figures.

    :return: machine stats dictionary; ``*_mbs`` entries are byte counters
        divided by 1024**2, ``*_gb`` entries are divided by a further 1024,
        the remaining entries are percentages or temperatures
    """
    bytes_per_megabyte = 1024**2

    def bytes_to_megabytes(x):
        return x / bytes_per_megabyte

    per_core = [float(v) for v in psutil.cpu_percent(percpu=True)]
    stats = {"cpu_usage": sum(per_core) / float(len(per_core))}

    memory = psutil.virtual_memory()
    stats["memory_used_gb"] = bytes_to_megabytes(memory.used) / 1024
    stats["memory_free_gb"] = bytes_to_megabytes(memory.available) / 1024

    home_usage = psutil.disk_usage(Text(Path.home()))
    stats["disk_free_percent"] = 100.0 - home_usage.percent

    # sensors_temperatures() is not present in every psutil build
    if hasattr(psutil, "sensors_temperatures"):
        sensor_stat = psutil.sensors_temperatures()
    else:
        sensor_stat = {}
    if "coretemp" in sensor_stat and len(sensor_stat["coretemp"]):
        stats["cpu_temperature"] = max(
            float(t.current) for t in sensor_stat["coretemp"])

    # cumulative network and disk counters
    net_stats = psutil.net_io_counters()
    stats["network_tx_mbs"] = bytes_to_megabytes(net_stats.bytes_sent)
    stats["network_rx_mbs"] = bytes_to_megabytes(net_stats.bytes_recv)
    io_stats = psutil.disk_io_counters()
    stats["io_read_mbs"] = bytes_to_megabytes(io_stats.read_bytes)
    stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)

    # check if we can access the gpu statistics
    if gpustat:
        for i, g in enumerate(gpustat.new_query().gpus):
            prefix = "gpu_%d_" % i
            stats[prefix + "temperature"] = float(g["temperature.gpu"])
            stats[prefix + "utilization"] = float(g["utilization.gpu"])
            stats[prefix + "mem_usage"] = 100. * float(
                g["memory.used"]) / float(g["memory.total"])
            # already in MBs
            stats[prefix + "mem_free_gb"] = float(
                g["memory.total"] - g["memory.used"]) / 1024
            stats[prefix + "mem_used_gb"] = float(g["memory.used"]) / 1024

    return stats
def test_new_query_mocked(self):
    """
    Render a query with user, command and PID columns enabled and compare
    the output, after `remove_ansi_codes`, with a literal expected block.
    """
    gpustats = gpustat.new_query()
    fp = StringIO()
    gpustats.print_formatted(fp=fp, no_color=False, show_user=True,
                             show_cmd=True, show_pid=True)

    result = fp.getvalue()
    print(result)  # echo the raw output so it shows up in the test log

    unescaped = remove_ansi_codes(result)
    # NOTE(review): the comparison below is whitespace-exact; confirm the
    # column padding inside this literal against the real formatter output.
    self.assertEqual(unescaped, """\
[0] GeForce GTX TITAN X | 80'C, 76 % | 8000 / 12287 MB | user1:python/48448(4000M) user2:python/153223(4000M)
[1] GeForce GTX TITAN X | 36'C, 0 % | 9000 / 12287 MB | user1:torch/192453(3000M) user3:caffe/194826(6000M)
[2] GeForce GTX TITAN X | 71'C, ?? % | 8520 / 12287 MB | user3:python/38310(4245M) --:--/--(?M)
""")
async def query_nvml_health():
    """Return the health-related fields of every GPU as a list of dicts.

    Each dict has the keys ``gpu_no``, ``fan_speed``, ``temperature``,
    ``power_draw``, ``power_limit``, ``core_clock`` and ``mem_clock``. The
    list is also printed when ``LOGLEVEL`` is ``"DEBUG"``.
    """
    # query every GPU and keep only the fields we care about
    gpu_stats = [
        {
            'gpu_no': stat.index,
            'fan_speed': stat.fan_speed,
            'temperature': stat.temperature,
            'power_draw': stat.power_draw,
            'power_limit': stat.power_limit,
            'core_clock': stat.core_clock,
            'mem_clock': stat.mem_clock,
        }
        for stat in gpustat.new_query()
    ]

    if LOGLEVEL == "DEBUG":
        print(gpu_stats)
    return gpu_stats
def get_cpu_memory_gpu():
    """Sample CPU load, used system memory and the first GPU's load.

    :return: ``(cpu_usage, memory_usage, gpu_memory, gpu_util)`` where
        ``cpu_usage`` is ``psutil.cpu_percent`` over a one-second interval,
        ``memory_usage`` is the used memory divided by 1024 three times,
        ``gpu_memory`` is the used share of the first GPU's memory in
        percent and ``gpu_util`` its utilisation. Both GPU values are 0
        when the GPU query raises.
    """
    cpu_usage = psutil.cpu_percent(interval=1)
    memory_usage = psutil.virtual_memory().used / 1024 / 1024 / 1024

    try:
        query = gpustat.new_query()
    except Exception:
        return cpu_usage, memory_usage, 0, 0

    first_gpu = query.gpus[0]
    gpu_memory = 100.0 * first_gpu.memory_used / first_gpu.memory_total
    gpu_util = first_gpu.utilization
    return cpu_usage, memory_usage, gpu_memory, gpu_util
def test_attributes_and_items(self):
    """Item access and attribute access on a GPU entry must agree."""
    gpu = gpustat.new_query()[2]  # includes N/A

    print("(keys) : %s" % str(gpu.keys()))
    print(gpu)

    # the same data through both access styles
    self.assertEqual(gpu['name'], gpu.entry['name'])
    self.assertEqual(gpu['uuid'], gpu.uuid)

    # unknown keys are rejected
    with self.assertRaises(KeyError):
        gpu['unknown_key']

    print("uuid : %s" % gpu.uuid)
    print("name : %s" % gpu.name)
    memory_fields = (gpu.memory_used, gpu.memory_total, gpu.memory_available)
    print("memory : used %d total %d avail %d" % memory_fields)
    print("temperature : %d" % (gpu.temperature))
    print("utilization : %s" % (gpu.utilization))
def test_attributes_and_items(self):
    """Item access and attribute access on a GPU entry must agree."""
    entry = gpustat.new_query()[2]  # includes N/A

    print("(keys) : %s" % str(entry.keys()))
    print(entry)

    # the same data through both access styles
    self.assertEqual(entry['name'], entry.entry['name'])
    self.assertEqual(entry['uuid'], entry.uuid)

    # unknown keys are rejected
    with self.assertRaises(KeyError):
        entry['unknown_key']

    print("uuid : %s" % entry.uuid)
    print("name : %s" % entry.name)
    memory_values = (
        entry.memory_used,
        entry.memory_total,
        entry.memory_available,
    )
    print("memory : used %d total %d avail %d" % memory_values)
    print("temperature : %d" % (entry.temperature))
    print("utilization : %s" % (entry.utilization))
def get_first_available_gpu():
    """
    Find the first GPU that looks unused.

    Each GPU's id and ``memory_used`` are printed. A GPU qualifies when
    ``memory_used`` is below 2000 and ``tmux has-session -t GPU<id>``
    returns a non-zero status. For GPU 0 only, if it reports zero
    utilisation, less than 12 used memory and no processes, its tmux
    session is killed before that check.

    :return: gpu_id of the first qualifying GPU, or -1 if there is none
    """
    query = gpustat.new_query()
    for gpu_id in range(len(query)):
        gpu = query[gpu_id]
        print(gpu_id, gpu.memory_used)
        if gpu.memory_used >= 2000:
            continue

        stale_session_on_gpu0 = (gpu.utilization == 0
                                 and gpu.memory_used < 12
                                 and gpu_id == 0
                                 and len(gpu.processes) == 0)
        if stale_session_on_gpu0:
            os.system(f"tmux kill-session -t GPU{gpu_id}")

        status = os.system(f"tmux has-session -t GPU{gpu_id} 2>/dev/null")
        if int(status) != 0:
            return gpu_id
    return -1
def test_new_query_mocked_nonexistent_pid(self, scenario_nonexistent_pid):
    """
    Test a case where nvidia query returns non-existent pids (see #16, #18)
    for GPU index 2: the line for that GPU must still be printed, without
    the pid and without a "(Not Supported)" marker.
    """
    buffer = StringIO()
    gpustat.new_query().print_formatted(fp=buffer)

    ret = buffer.getvalue()
    print(ret)

    # gpu 2: should ignore process id
    gpu2_line = remove_ansi_codes(ret).split('\n')[3]
    assert '[2] GeForce GTX TITAN 2' in gpu2_line, str(gpu2_line)
    for forbidden in ('99999', '(Not Supported)'):
        assert forbidden not in gpu2_line
def test_new_query_mocked(self):
    """
    Render a query with user, command and PID columns enabled and compare
    the output, after `remove_ansi_codes`, with a literal expected block.
    """
    gpustats = gpustat.new_query()
    fp = StringIO()
    gpustats.print_formatted(fp=fp,
                             no_color=False,
                             show_user=True,
                             show_cmd=True,
                             show_pid=True)

    result = fp.getvalue()
    print(result)  # echo the raw output so it shows up in the test log

    unescaped = remove_ansi_codes(result)
    # NOTE(review): the comparison below is whitespace-exact; confirm the
    # column padding inside this literal against the real formatter output.
    self.assertEqual(
        unescaped, """\
[0] GeForce GTX TITAN X | 80'C, 76 % | 8000 / 12287 MB | user1:python/48448(4000M) user2:python/153223(4000M)
[1] GeForce GTX TITAN X | 36'C, 0 % | 9000 / 12287 MB | user1:torch/192453(3000M) user3:caffe/194826(6000M)
[2] GeForce GTX TITAN X | 71'C, ?? % | 8520 / 12287 MB | user3:python/38310(4245M) --:--/--(?M)
""")
def test_attributes_and_items(self, scenario_basic):
    """Test whether each property of `GPUStat` instance is well-defined."""
    gpu = gpustat.new_query()[1]  # includes N/A

    print("(keys) : %s" % str(gpu.keys()))
    print(gpu)

    # the same data through item access and attribute access
    assert gpu['name'] == gpu.entry['name']
    assert gpu['uuid'] == gpu.uuid

    # unknown keys are rejected
    with pytest.raises(KeyError):
        gpu['unknown_key']

    print("uuid : %s" % gpu.uuid)
    print("name : %s" % gpu.name)
    memory_fields = (gpu.memory_used, gpu.memory_total, gpu.memory_available)
    print("memory : used %d total %d avail %d" % memory_fields)
    print("temperature : %d" % (gpu.temperature))
    print("utilization : %s" % (gpu.utilization))
    print("utilization_enc : %s" % (gpu.utilization_enc))
    print("utilization_dec : %s" % (gpu.utilization_dec))
def auto_select_gpu(assigned_gpu_id=None):
    """Choose the GPU to run on, manually or by remaining capacity.

    Candidate devices come from ``CUDA_VISIBLE_DEVICES`` when it is set,
    otherwise from the ``index`` field of every queried GPU.

    :param assigned_gpu_id: position inside the candidate list to use; an
        out-of-range value prints a warning and falls back to position 0.
        ``None`` triggers automatic selection from the pairs returned by
        ``get_gpu_compute_rest``.
    :return: ``(best, gpu_name, valid_gpus)``: the chosen position inside
        the candidate list, that GPU's name, and the candidate ids joined
        with commas
    """
    gpu_id_list = os.getenv('CUDA_VISIBLE_DEVICES')
    gpu_stats_list = gpustat.new_query()
    if gpu_id_list is None:
        gpu_id_list = [g['index'] for g in gpu_stats_list]
    else:
        gpu_id_list = [int(value) for value in gpu_id_list.split(',')]

    # "is not None" rather than truthiness, so that an explicit index 0 is
    # honoured instead of silently triggering automatic selection.
    if assigned_gpu_id is not None:
        best = assigned_gpu_id
        if best >= len(gpu_id_list):
            print("WARNING: Manually selected gpu index is out of range!")
            best = 0
        gpu_name = gpu_stats_list[gpu_id_list[best]]['name']
    else:
        device_compute_rest = get_gpu_compute_rest(gpu_stats_list,
                                                   gpu_id_list)
        device_first_level, device_second_level, device_third_level = {}, {}, {}
        for i in device_compute_rest:
            computeRestRate, memRestRate = device_compute_rest[i]
            if memRestRate > 0.3 and computeRestRate > 0.5:
                device_first_level[i] = computeRestRate
            elif 0.1 < memRestRate <= 0.3 and computeRestRate > 0.5:
                device_second_level[i] = memRestRate
            else:
                device_third_level[i] = memRestRate + computeRestRate

        # Take the highest-scoring device of the best non-empty level.
        if device_first_level:
            best = max(device_first_level.items(), key=lambda x: x[1])[0]
            print("INFO: Using the first level GPU card")
        elif device_second_level:
            best = max(device_second_level.items(), key=lambda x: x[1])[0]
            print("WARNING: Using the second level GPU card")
        else:
            print("WARNING: Using the third level GPU card")
            best = max(device_third_level.items(), key=lambda x: x[1])[0]

        # Here `best` is a device id: fetch the name first, then turn it
        # into a position inside the candidate list.
        gpu_name = gpu_stats_list[best]['name']
        best = gpu_id_list.index(best)

    valid_gpus = [str(gpu_idx) for gpu_idx in gpu_id_list]
    return best, gpu_name, ','.join(valid_gpus)
def test_new_query_mocked(self, N, Process):
    """
    Basic check of the formatted output, power column included: with ANSI
    codes and the header line removed it must equal
    ``MOCK_EXPECTED_OUTPUT_FULL``.
    """
    _configure_mock(N, Process)

    query = gpustat.new_query()
    buffer = StringIO()
    query.print_formatted(
        fp=buffer,
        no_color=False,
        show_user=True,
        show_cmd=True,
        show_pid=True,
        show_power=True,
    )

    rendered = buffer.getvalue()
    print(rendered)

    # remove first line (header)
    per_gpu_lines = remove_ansi_codes(rendered).split('\n')[1:]
    unescaped = '\n'.join(per_gpu_lines)

    self.maxDiff = 4096
    self.assertEqual(unescaped, MOCK_EXPECTED_OUTPUT_FULL)
def test_attributes_and_items(self, N, Process):
    """
    Test whether each property of `GPUStat` instance is well-defined.
    """
    _configure_mock(N, Process)
    gpu = gpustat.new_query()[1]  # includes N/A

    print("(keys) : %s" % str(gpu.keys()))
    print(gpu)

    # the same data through item access and attribute access
    self.assertEqual(gpu['name'], gpu.entry['name'])
    self.assertEqual(gpu['uuid'], gpu.uuid)

    # unknown keys are rejected
    with self.assertRaises(KeyError):
        gpu['unknown_key']

    print("uuid : %s" % gpu.uuid)
    print("name : %s" % gpu.name)
    memory_fields = (gpu.memory_used, gpu.memory_total, gpu.memory_available)
    print("memory : used %d total %d avail %d" % memory_fields)
    print("temperature : %d" % (gpu.temperature))
    print("utilization : %s" % (gpu.utilization))