def __init__(self, name=None):
    self.name = name if name is not None else socket.gethostname()
    self.cpu_info = get_cpu_info()
    self.status = deque(maxlen=10000)
    self.__end = False
    try:
        GPUStatCollection.new_query().jsonify()
        self.__type = 'gpu'
    except:
        self.__type = 'cpu'
    self.__my_hardware_state(interval=0.1)
    self.__t = threading.Thread(target=self.__get_cpu_percent_loop)
    self.__t.start()
def get_free_gpu(mode="memory", memory_need=11000) -> list:
    r"""Get free GPUs according to mode (process-free or memory-free).

    Args:
        mode (str, optional): "memory" or "process". Defaults to "memory".
        memory_need (int): The amount of free memory (MB) you need; used if
            mode == 'memory'. Defaults to 11000.

    Returns:
        list: free gpu ids
    """
    assert mode in [
        "memory", "process"
    ], "mode must be 'memory' or 'process', but got {}".format(mode)

    if mode == "memory":
        assert memory_need is not None, \
            "'memory_need' is None; 'memory' mode requires the amount of free memory you want to apply for"
        memory_need = int(memory_need)
        assert memory_need > 0, "'memory_need' must be positive"

    gpu_stats = GPUStatCollection.new_query()
    gpu_free_id_list = []
    for idx, gpu_stat in enumerate(gpu_stats):
        if gpu_check_condition(gpu_stat, mode, memory_need):
            gpu_free_id_list.append(idx)
            print("gpu[{}]: {}MB".format(idx, gpu_stat.memory_free))
    return gpu_free_id_list
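# Usage sketch for get_free_gpu() above (illustrative driver code, not part of
# the original snippet): take the first reported free GPU and expose only that
# device to the current process via CUDA_VISIBLE_DEVICES.
import os

free_ids = get_free_gpu(mode="memory", memory_need=11000)
if free_ids:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(free_ids[0])
else:
    raise RuntimeError("no GPU currently satisfies the memory requirement")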
def check(self, instance):
    try:
        gpu_stats = GPUStatCollection.new_query()
        for gpu in gpu_stats.gpus:
            entry = gpu.entry
            tags = ['gpu:{}'.format(entry['index'])]
            self.gauge('gpu.memory.used', entry['memory.used'], tags=tags)
            self.gauge('gpu.memory.total', entry['memory.total'], tags=tags)
            self.gauge('gpu.utilization', entry['utilization.gpu'], tags=tags)
            self.gauge('gpu.temperature', entry['temperature.gpu'], tags=tags)
            self.gauge('gpu.power.draw', entry['power.draw'], tags=tags)
            self.gauge('gpu.enforced.power.limit', entry['enforced.power.limit'], tags=tags)
    except Exception as ex:
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'gpu_stat',
            'msg_title': 'Error in gpu stat',
            'msg_text': str(ex),
        })
def my_gpustat():
    """
    Returns a [safe] version of gpustat for this host.
        # See `--safe-zone` option of `gpuview start`.
        # Omit sensitive details, eg. uuid, username, and processes.
        # Set color flag based on gpu temperature:
        #   bg-warning, bg-danger, bg-success, bg-primary

    Returns:
        dict: gpustat
    """
    try:
        from gpustat import GPUStatCollection
        stat = GPUStatCollection.new_query().jsonify()
        delete_list = []
        for gpu_id, gpu in enumerate(stat['gpus']):
            if type(gpu['processes']) is str:
                delete_list.append(gpu_id)
                continue
            gpu['memory'] = round(
                float(gpu['memory.used']) / float(gpu['memory.total']) * 100)
            if SAFE_ZONE:
                gpu['users'] = len(
                    set([p['username'] for p in gpu['processes']]))
                user_process = [
                    '%s(%s,%sM)' % (p['username'], p['command'],
                                    p['gpu_memory_usage'])
                    for p in gpu['processes']
                ]
                gpu['user_processes'] = ' '.join(user_process)
            else:
                gpu['users'] = len(
                    set([p['username'] for p in gpu['processes']]))
                processes = len(gpu['processes'])
                gpu['user_processes'] = '%s/%s' % (gpu['users'], processes)
                gpu.pop('processes', None)
                gpu.pop("uuid", None)
                gpu.pop("query_time", None)

            gpu['flag'] = 'bg-primary'
            if gpu['temperature.gpu'] > 75:
                gpu['flag'] = 'bg-danger'
            elif gpu['temperature.gpu'] > 50:
                gpu['flag'] = 'bg-warning'
            elif gpu['temperature.gpu'] > 25:
                gpu['flag'] = 'bg-success'

        if delete_list:
            # Pop in reverse so earlier removals do not shift later indices.
            for gpu_id in reversed(delete_list):
                stat['gpus'].pop(gpu_id)

        return stat
    except Exception as e:
        return {'error': '%s!' % getattr(e, 'message', str(e))}
def gpustat_server():
    stats = GPUStatCollection.new_query()
    rep = Response(json.dumps(stats.jsonify(), default=date_handler),
                   mimetype='application/json')
    rep.headers = {
        **rep.headers,
        **{
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'GET'
        }
    }
    return rep
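# Hedged sketch of the surroundings gpustat_server() above appears to assume:
# a Flask app and a JSON date handler for the datetime query_time produced by
# GPUStatCollection.jsonify(). The route path and port below are assumptions
# made for illustration, not taken from the original snippet.
import json
from datetime import datetime

from flask import Flask, Response
from gpustat import GPUStatCollection

app = Flask(__name__)

def date_handler(obj):
    # json.dumps cannot serialize datetime objects such as query_time.
    return obj.isoformat() if isinstance(obj, datetime) else str(obj)

app.add_url_rule('/gpustat', 'gpustat_server', gpustat_server)  # assumed path

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=9988)  # assumed port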
def set_gpus(n_gpus=gpu_settings["n_gpus"],
             min_vram=gpu_settings["min_vram"],
             split_gpu_into=gpu_settings["split_gpu_into"]):
    '''
    Configures the GPUs to be allocated for training, preferring the GPUs
    with the most free VRAM.

    :param n_gpus: How many physical GPUs to allocate for this training
        process. Set to 0 to run on CPU.
    :param min_vram: How much free VRAM (MB) each physical GPU has to have.
        Too low a value causes an error if the GPU runs out of memory during
        training. This prevents TensorFlow from allocating all of the GPU
        memory to the process.
    :param split_gpu_into: How many logical GPUs to split each physical GPU
        into. This can speed up training due to distributed training. Each
        physical GPU has to have min_vram * split_gpu_into VRAM available or
        an error is raised.
    :return: None
    '''
    if n_gpus == 0:
        environ['CUDA_VISIBLE_DEVICES'] = ''

    gpu_stats = GPUStatCollection.new_query()
    gpu_ids = map(lambda gpu: int(gpu.entry['index']), gpu_stats)
    gpu_freemem = map(
        lambda gpu: float(gpu.entry['memory.total'] - gpu.entry['memory.used']),
        gpu_stats)
    pairs = list(zip(gpu_ids, gpu_freemem))
    valid_pairs = [
        pair for pair in pairs if pair[1] >= min_vram * split_gpu_into
    ]
    if len(valid_pairs) < n_gpus:
        raise ValueError(
            f"Not enough valid GPUs detected. Check if the machine has at least {n_gpus} GPUs "
            f"with at least {min_vram * split_gpu_into}MB free VRAM or set a lower --n_gpus value"
        )
    sorted_indices = list(argsort([mem[1] for mem in valid_pairs]))[::-1]
    sorted_pairs = [valid_pairs[i] for i in sorted_indices]

    if n_gpus != 0:
        print(
            f"Setting {n_gpus} physical GPUs split into {n_gpus * split_gpu_into} "
            f"logical GPUs with {min_vram}MB VRAM each for this training"
        )
    else:
        print("Training on CPU")

    environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    devices = ",".join([str(pair[0]) for pair in sorted_pairs[:n_gpus]])
    environ['CUDA_VISIBLE_DEVICES'] = devices

    if split_gpu_into > 1:
        physical_devices = tf_config.list_physical_devices('GPU')
        for device in physical_devices:
            tf_config.set_logical_device_configuration(device, [
                tf_config.LogicalDeviceConfiguration(memory_limit=min_vram)
                for _ in range(split_gpu_into)
            ])
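# Usage sketch for set_gpus() above (illustrative values): request two
# physical GPUs, each split into two logical GPUs with at least 4096 MB of
# free VRAM, then list what TensorFlow ended up with. Assumes tf_config is
# the tensorflow.config module aliased by the surrounding code.
set_gpus(n_gpus=2, min_vram=4096, split_gpu_into=2)
print(tf_config.list_logical_devices('GPU'))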
def get_gpu_list():
    try:
        gpu_collection = GPUStatCollection.new_query()
        gpu_infos = [g.jsonify() for g in gpu_collection]
        gpu_json = {
            'gpus': [{
                'name': g['name'],
                'memory': float(g['memory.total']) / 1024
            } for g in gpu_infos]
        }
        return gpu_json
    except:
        return {'gpus': []}
def log_gpu_state(collection):
    stat = GPUStatCollection.new_query().jsonify()
    if not len(stat['gpus']) > 0:
        logging.error('No gpus found')

    for gpu in stat['gpus']:
        hostname = stat['hostname']
        query_time = stat['query_time']
        index = gpu['index']
        total_memory = gpu['memory.total']
        utilization_memory = gpu['memory.used']
        name = gpu['name']
        power = gpu['power.draw']
        temperature = gpu['temperature.gpu']
        utilization_gpu = gpu['utilization.gpu']
        process_list = gpu['processes']
        active_user_list = []
        for process in process_list:
            active_user_list.append(process['username'])

        collection.insert_one(
            create_gpu_state_dict(hostname, query_time, index, total_memory,
                                  utilization_memory, name, power, temperature,
                                  utilization_gpu, process_list,
                                  active_user_list))
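# Hedged sketch of the pieces log_gpu_state() above relies on but does not
# show: a pymongo collection and a create_gpu_state_dict() helper. The field
# names, database/collection names, and connection URI below are assumptions
# for illustration only.
from pymongo import MongoClient

def create_gpu_state_dict(hostname, query_time, index, total_memory,
                          utilization_memory, name, power, temperature,
                          utilization_gpu, process_list, active_user_list):
    # Flatten one GPU's state into a single document.
    return {
        'hostname': hostname,
        'query_time': query_time,
        'index': index,
        'memory_total': total_memory,
        'memory_used': utilization_memory,
        'name': name,
        'power_draw': power,
        'temperature_gpu': temperature,
        'utilization_gpu': utilization_gpu,
        'processes': process_list,
        'active_users': active_user_list,
    }

client = MongoClient('mongodb://localhost:27017')   # assumed URI
log_gpu_state(client['monitoring']['gpu_states'])   # assumed db/collection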
def __my_hardware_state(self, interval=1):
    stat = {"host_name": self.name, "is_alive": True, 'type': self.__type}

    # basic info
    try:
        stat['cpus'] = [{
            "name": self.cpu_info.get('brand_raw', "CPU"),
            "usage": cpu
        } for cpu in psutil.cpu_percent(interval=interval, percpu=True)]
        stat['mem'] = {
            "total": bytes2MB(psutil.virtual_memory().total),
            "used": bytes2MB(psutil.virtual_memory().used)
        }
        stat['disk'] = [{
            "total": bytes2MB(psutil.disk_usage('/').total),
            "used": bytes2MB(psutil.disk_usage('/').used)
        }]
        stat['net'] = {
            "in": bytes2MB(psutil.net_io_counters().bytes_recv),
            "out": bytes2MB(psutil.net_io_counters().bytes_sent)
        }
    except Exception as e:
        print({'error': '%s!' % getattr(e, 'message', str(e))})
        stat['is_alive'] = False

    # gpu info
    stat['gpus'] = []
    try:
        gpu_stat = GPUStatCollection.new_query().jsonify()
        stat['gpus'] = [{
            "name": gpu.get('name', "CPU"),
            "usage": gpu.get('utilization.gpu'),
            "men_used": gpu.get('memory.used'),
            "men_total": gpu.get('memory.total'),
            "temp": gpu.get('temperature.gpu')
        } for gpu in gpu_stat['gpus']]
    except:
        pass

    self.status.append(stat)
from gpustat import GPUStatCollection
from pprint import pprint

stat = GPUStatCollection.new_query().jsonify()
pprint(stat)
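# The same query without jsonify(), iterating the collection directly; the
# per-GPU attributes used here (index, name, memory_free, temperature,
# power_draw) are the ones the other snippets in this section also rely on.
from gpustat import GPUStatCollection

for gpu in GPUStatCollection.new_query():
    print("[{}] {}: {} MB free, {} C, {} W".format(
        gpu.index, gpu.name, gpu.memory_free, gpu.temperature, gpu.power_draw))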
def process(self, config, coin):
    devices = {}

    ### scan for AMD metrics using rocm-smi
    try:
        sys.path.append('/opt/rocm/bin')
        rocm_smi = importlib.import_module('rocm_smi')
        for device in sorted(rocm_smi.listDevices()):
            clock = rocm_smi.getCurrentClock(device, 'mem', 'freq')
            if clock is None:
                continue
            clock = clock.replace('Mhz', '')
            temp = rocm_smi.getSysfsValue(device, 'temp')
            power = rocm_smi.getSysfsValue(device, 'power').split('.')[0]
            vbios = rocm_smi.getSysfsValue(device, 'vbios')
            gpuid = rocm_smi.getSysfsValue(device, 'id')
            fanspeed = rocm_smi.getFanSpeed(device)
            devices['AMD' + device[4:]] = [
                str(temp).replace('.0', ''), power, clock, vbios, fanspeed, gpuid
            ]
    except ImportError as ex:
        if config.PLATFORM == 'AMD' or config.PLATFORM == 'BTH':
            print('ImportError: ' + str(ex), file=sys.stderr)
            print("  Try 'sudo apt-get -y install rocm-amdgpu-pro'", file=sys.stderr)
    except OSError as ex:
        if config.VERBOSE:
            if str(ex) and str(ex).find('[Errno 2] No such file or directory') < 0:
                print(ex, file=sys.stderr)
            else:
                print("Cannot discover AMD devices, since 'rocm-smi' is not installed. "
                      "See 'install/install-amd-pro' for instructions.", file=sys.stderr)

    ### Scan for Nvidia using gpustat.GPUStatCollection
    gpu_stats = []
    try:
        from gpustat import GPUStatCollection
        gpu_stats = GPUStatCollection.new_query()
        if config.VERBOSE:
            print(str(len(gpu_stats)) + " Nvidia devices found.")
    # except NVMLError_GpuIsLost as ex:
    except NVMLError as ex:
        if str(ex) != 'Driver Not Loaded':
            print('FAIL: ' + str(ex), file=sys.stderr)
        elif ex.value is None or config.PLATFORM == 'NVI' or config.PLATFORM == 'BTH':
            pip = 'pip2'
            if six.PY3:
                pip = 'pip3'
            print("gpustat for Nvidia GPUs is not installed.\nUse '" + pip +
                  " install gpustat' to install it.", file=sys.stderr)
    except:
        ex = sys.exc_info()
        print(ex, file=sys.stderr)

    idx = 0
    for gpu in gpu_stats:
        devices['NVI' + str(gpu.index)] = gpu
        idx += 1

    idxNVI = 0
    total_nvi_watts = 0
    total_amd_watts = 0
    for device in sorted(devices):
        if 'AMD' in device:
            dev = devices[device]
            verbose = ''
            if dev[1]:
                power = int(dev[1])
                total_amd_watts += power
                power = '%3iW ' % (power)
            else:
                power = ' N/A '
            if dev[2]:
                speed = '%4iMhz' % (int(dev[2]))
            else:
                speed = ' N/A '
            if config.VERBOSE:
                verbose = ' ' + str(int(dev[4])) + '% ' + dev[3] + ' (' + dev[5] + ') '
            print(device + ' ' + dev[0] + 'C ' + power + speed + verbose)
        else:
            uuid = ''
            if config.VERBOSE:
                uuid = devices[device].uuid
            watts = devices[device].power_draw
            if not watts:
                strWatts = ' N/A'  # Some GPUs (looking at you GTX 750) do not return power level
            else:
                strWatts = "%3sW" % (watts)
            print("%s: %2sC %4s %s %s" % (device, devices[device].temperature,
                                          strWatts, devices[device].name, uuid))
            if watts:
                total_nvi_watts += int(watts)
            idxNVI += 1

    total_watts = total_nvi_watts + total_amd_watts
    if total_nvi_watts != 0 and total_nvi_watts != total_watts:
        print("TOTAL: " + str(total_nvi_watts) + ' watts (NVI)')
    if total_amd_watts != 0 and total_amd_watts != total_watts:
        print("TOTAL: " + str(total_amd_watts) + ' watts (AMD)')
    print("TOTAL: " + str(total_watts) + ' watts')

    return config.ALL_MEANS_ONCE
def gpu_stats():
    d = GPUStatCollection.new_query().jsonify()
    return jsonify(d)
def process(self, config, coin, quiet=False):
    global OverclockConfig

    # volatile means this operation makes changes in settings
    VOLATILE = not config.DRYRUN and not config.QUERY

    postfix = '-' + coin['COIN']
    if config.ALL_COINS:
        postfix = ''

    if not config.FORCE and not config.DRYRUN and status.get_status(None) and VOLATILE:
        if not config.QUICK and not quiet:
            print("A miner is currently running, so we are skipping overclocking (use -f to force).")
        return config.ALL_MEANS_ONCE

    gpu_stats = []
    try:
        gpu_stats = GPUStatCollection.new_query()
    except NameError as ex:
        print('NameError: Cannot load GPUStatCollection.')
        print(ex)
        print("To fix this, do 'pip3 install gpustat'.")
        if not config.DRYRUN:
            return config.ALL_MEANS_ONCE
    except:
        if not config.DRYRUN:
            if config.PLATFORM != 'AMD' and not quiet:
                print('Except: Cannot load GPUStatCollection on platform=' + config.PLATFORM)
                ex = sys.exc_info()
                print(ex)
            elif not config.QUICK and not quiet:
                ### TODO: https://github.com/GPUOpen-Tools/GPA/blob/master/BUILD.md
                print("'miners overclock' is not implemented for AMD devices")
            return config.ALL_MEANS_ONCE

    normalizedDevices = read_overclock_yml()

    sudo_nvidia_settings = get_sudo_nvidia_settings(config)
    xauthority = '~/.Xauthority'
    if sudo_nvidia_settings:
        xauthority = '/var/lib/lightdm/.Xauthority'
    settings = 'DISPLAY=:0 XAUTHORITY=' + xauthority + ' ' + sudo_nvidia_settings + 'nvidia-settings -c :0'

    nvidia_pwrs = {}
    oper = '-a'
    if config.QUERY:
        if config.VERBOSE:
            oper = '-q'
        else:
            oper = '--terse -q'

    for gpu in gpu_stats:
        if gpu.uuid in normalizedDevices:
            dev = normalizedDevices[gpu.uuid]
            oc = dev.get('OverClock', {})                      # default undervolt (e.g. power-limit),
            oc = oc.get(coin['COIN'], oc.get('___', '0,150'))  # unless a coin-specific one is given
            oc, uv = oc.split(',')
        # old-way, deprecated until we've migrated all into conf/overclock.yml, then will be removed
        elif gpu.uuid.upper() in config.SHEETS['Overclock']:
            dev = config.SHEETS['Overclock'][gpu.uuid.upper()]
            uv = dev['UV']                 # default undervolt (or watts-limit)
            if 'UV' + postfix in dev:      # unless a coin-specific one is given
                uv = dev['UV' + postfix]
            oc = dev['OC']                 # default overclock
            if 'OC' + postfix in dev:      # unless a coin-specific one is given
                oc = dev['OC' + postfix]
        if oc:
            settings += ' ' + oper + ' "[gpu:' + str(gpu.index) + ']/GPUMemoryTransferRateOffset[3]'
            if not config.QUERY:
                settings += '=' + str(int(oc))
            settings += '"'
        if uv:
            iuv = int(uv)
            if iuv in nvidia_pwrs:
                nvidia_pwrs[iuv].append(str(gpu.index))
            else:
                nvidia_pwrs[iuv] = [str(gpu.index)]

    overclock_dryrun = os.getenv('LOG_RAMDISK', '/var/local/ramdisk') + '/overclock-dryrun.sh'
    with open(overclock_dryrun, 'w') as fh:
        if not config.QUERY:
            fh.write("echo '%s %i %s'\n\n" % ('Overclocking', len(gpu_stats), 'GPUs.'))
            fh.write('%s\n' % ('sudo nvidia-smi -pm 1'))
        for pwr in nvidia_pwrs:
            if not config.QUERY:
                cmd = "sudo nvidia-smi -i " + ','.join(nvidia_pwrs[pwr]) + " -pl " + str(pwr)
                fh.write('%s\n' % (cmd))
                fh.write("\n")
                if config.VERBOSE:
                    print(cmd)
        fh.write(settings)
        fh.write("\n")
        if config.VERBOSE:
            print(settings)

    os.chmod(overclock_dryrun,
             stat.S_IXUSR | stat.S_IXGRP | stat.S_IWUSR | stat.S_IWGRP |
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    if os.getenv('MINERS_USER'):
        os.chown(overclock_dryrun, getpwnam(os.getenv('MINERS_USER')).pw_uid, -1)

    if config.DRYRUN:
        print("\nexport DISPLAY=:0\nexport XAUTHORITY=" + xauthority + "\n")
        with open(overclock_dryrun, 'r') as fh:
            print(fh.read().replace('-a', " \\\n -a"))
    else:
        overclock_filename = os.getenv('LOG_RAMDISK', '/var/local/ramdisk') + '/overclock.sh'
        if VOLATILE and not config.FORCE and os.path.isfile(overclock_filename) \
                and filecmp.cmp(overclock_dryrun, overclock_filename):
            if not config.QUICK and not config.QUERY:
                timestamp = time.ctime(os.path.getctime(overclock_filename))
                print("Overclock settings are identical to those already set at '" +
                      timestamp + "', so we are keeping them (use -f to force).")
        else:
            os.rename(overclock_dryrun, overclock_filename)
            os.system("/bin/bash " + overclock_filename)
            if config.VERBOSE:
                # Read back the renamed script; overclock_dryrun no longer exists after os.rename.
                with open(overclock_filename, 'r') as fh:
                    print(fh.read())

    if os.path.isfile(overclock_dryrun):
        os.remove(overclock_dryrun)

    return config.ALL_MEANS_ONCE