Code example #1
File: core.py Project: voidful/shows
    def __init__(self, name=None):
        self.name = name if name is not None else socket.gethostname()
        self.cpu_info = get_cpu_info()
        self.status = deque(maxlen=10000)
        self.__end = False
        try:
            GPUStatCollection.new_query().jsonify()
            self.__type = 'gpu'
        except Exception:
            self.__type = 'cpu'

        self.__my_hardware_state(interval=0.1)
        self.__t = threading.Thread(target=self.__get_cpu_percent_loop)
        self.__t.start()
Code example #2
File: gpu.py Project: Gorilla-Lab-SCUT/gorilla-core
def get_free_gpu(mode="memory", memory_need=11000) -> list:
    r"""Get free gpu according to mode (process-free or memory-free).

    Args:
        mode (str, optional): memory-free or process-free. Defaults to "memory".
        memory_need (int, optional): The memory you need, used if mode == "memory". Defaults to 11000.

    Returns:
        list: free gpu ids
    """
    assert mode in [
        "memory", "process"
    ], "mode must be 'memory' or 'process', but got {}".format(mode)
    if mode == "memory":
        assert memory_need is not None, "'memory' mode requires 'memory_need' to specify the free memory you want to apply for"
        memory_need = int(memory_need)
        assert memory_need > 0, "'memory_need' you want must be positive"
    gpu_stats = GPUStatCollection.new_query()
    gpu_free_id_list = []

    for idx, gpu_stat in enumerate(gpu_stats):
        if gpu_check_condition(gpu_stat, mode, memory_need):
            gpu_free_id_list.append(idx)
            print("gpu[{}]: {}MB".format(idx, gpu_stat.memory_free))
    return gpu_free_id_list
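
A minimal usage sketch for the helper above (a hypothetical call site; it assumes get_free_gpu is in scope and that the process should be pinned to a single free GPU via CUDA_VISIBLE_DEVICES):

import os

free_ids = get_free_gpu(mode="memory", memory_need=11000)
if free_ids:
    # Expose only the first sufficiently free GPU to this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(free_ids[0])
else:
    raise RuntimeError("No GPU with enough free memory was found")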
Code example #3
def check(self, instance):
    try:
        gpu_stats = GPUStatCollection.new_query()
        for gpu in gpu_stats.gpus:
            entry = gpu.entry
            tags = ['gpu:{}'.format(entry['index'])]
            self.gauge('gpu.memory.used', entry['memory.used'], tags=tags)
            self.gauge('gpu.memory.total',
                       entry['memory.total'],
                       tags=tags)
            self.gauge('gpu.utilization',
                       entry['utilization.gpu'],
                       tags=tags)
            self.gauge('gpu.temperature',
                       entry['temperature.gpu'],
                       tags=tags)
            self.gauge('gpu.power.draw', entry['power.draw'], tags=tags)
            self.gauge('gpu.enforced.power.limit',
                       entry['enforced.power.limit'],
                       tags=tags)
    except Exception as ex:
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'gpu_stat',
            'msg_title': 'Error in gpu stat',
            'msg_text': str(ex),
        })
Code example #4
File: core.py Project: fgaim/gpuview
def my_gpustat():
    """
    Returns a [safe] version of gpustat for this host.
        # See `--safe-zone` option of `gpuview start`.
        # Omit sensitive details, eg. uuid, username, and processes.
        # Set color flag based on gpu temperature:
            # bg-warning, bg-danger, bg-success, bg-primary

    Returns:
        dict: gpustat
    """

    try:
        from gpustat import GPUStatCollection
        stat = GPUStatCollection.new_query().jsonify()
        delete_list = []
        for gpu_id, gpu in enumerate(stat['gpus']):
            if isinstance(gpu['processes'], str):
                delete_list.append(gpu_id)
                continue
            gpu['memory'] = round(
                float(gpu['memory.used']) / float(gpu['memory.total']) * 100)
            if SAFE_ZONE:
                gpu['users'] = len(
                    set([p['username'] for p in gpu['processes']]))
                user_process = [
                    '%s(%s,%sM)' %
                    (p['username'], p['command'], p['gpu_memory_usage'])
                    for p in gpu['processes']
                ]
                gpu['user_processes'] = ' '.join(user_process)
            else:
                gpu['users'] = len(
                    set([p['username'] for p in gpu['processes']]))
                processes = len(gpu['processes'])
                gpu['user_processes'] = '%s/%s' % (gpu['users'], processes)
                gpu.pop('processes', None)
                gpu.pop("uuid", None)
                gpu.pop("query_time", None)

            gpu['flag'] = 'bg-primary'
            if gpu['temperature.gpu'] > 75:
                gpu['flag'] = 'bg-danger'
            elif gpu['temperature.gpu'] > 50:
                gpu['flag'] = 'bg-warning'
            elif gpu['temperature.gpu'] > 25:
                gpu['flag'] = 'bg-success'

        if delete_list:
            # Pop in reverse order so earlier removals do not shift later indices.
            for gpu_id in reversed(delete_list):
                stat['gpus'].pop(gpu_id)

        return stat
    except Exception as e:
        return {'error': '%s!' % getattr(e, 'message', str(e))}
Code example #5
File: __main__.py Project: jolibrain/gpustat_server
def gpustat_server():
    stats = GPUStatCollection.new_query()
    rep = Response(json.dumps(stats.jsonify(), default=date_handler),
                   mimetype='application/json')
    rep.headers['Access-Control-Allow-Origin'] = '*'
    rep.headers['Access-Control-Allow-Methods'] = 'GET'
    return rep
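
A hedged sketch of how an endpoint like this could be wired into a minimal Flask app; the route name, port, and the date_handler implementation below are illustrative assumptions rather than part of the original project:

import json
from datetime import datetime

from flask import Flask, Response
from gpustat import GPUStatCollection

app = Flask(__name__)


def date_handler(obj):
    # json.dumps cannot serialize datetime objects natively.
    return obj.isoformat() if isinstance(obj, datetime) else str(obj)


@app.route('/gpustat')
def gpustat_endpoint():
    stats = GPUStatCollection.new_query()
    return Response(json.dumps(stats.jsonify(), default=date_handler),
                    mimetype='application/json')


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=12345)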
Code example #6
def set_gpus(n_gpus=gpu_settings["n_gpus"],
             min_vram=gpu_settings["min_vram"],
             split_gpu_into=gpu_settings["split_gpu_into"]):
    '''
    Configures the GPUs to be allocated for training, preferring the GPUs with the most free VRAM.

    :param
        n_gpus: How many physical GPUs to allocate for this training process. Set to 0 to run on CPU.
        min_vram: How much free VRAM (in MB) each physical GPU has to have. Too low a value causes an error if the GPU runs out of memory during training.
                  This prevents TensorFlow from allocating all of the GPU memory to the process.
        split_gpu_into: How many logical GPUs to split each physical GPU into. This can speed up training thanks to distributed training.
                        Each physical GPU has to have min_vram * split_gpu_into VRAM available or an error is raised.

    :return
        None
    '''

    if n_gpus == 0:
        environ['CUDA_VISIBLE_DEVICES'] = ''
    gpu_stats = GPUStatCollection.new_query()
    gpu_ids = map(lambda gpu: int(gpu.entry['index']), gpu_stats)
    gpu_freemem = map(
        lambda gpu: float(gpu.entry['memory.total'] - gpu.entry['memory.used']
                          ), gpu_stats)
    pairs = list(zip(gpu_ids, gpu_freemem))
    valid_pairs = [
        pair for pair in pairs if pair[1] >= min_vram * split_gpu_into
    ]

    if len(valid_pairs) < n_gpus:
        raise ValueError(
            f"Not enough valid GPUs detected. Check if the machine has at least {n_gpus} GPUs with at least {min_vram * split_gpu_into}MB free VRAM or set a lower --n_gpus value"
        )

    sorted_indices = list(argsort([mem[1] for mem in valid_pairs]))[::-1]
    sorted_pairs = [valid_pairs[i] for i in sorted_indices]
    if n_gpus != 0:
        print(
            f"Setting {n_gpus} physical GPUs split into {n_gpus * split_gpu_into} logical GPUs with {min_vram}MB VRAM each for this training"
        )
    else:
        print("Training on CPU")
    environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    devices = ",".join([str(pair[0]) for pair in sorted_pairs[:n_gpus]])
    environ['CUDA_VISIBLE_DEVICES'] = devices
    if split_gpu_into > 1:
        physical_devices = tf_config.list_physical_devices('GPU')
        for device in physical_devices:
            tf_config.set_logical_device_configuration(device, [
                tf_config.LogicalDeviceConfiguration(memory_limit=min_vram)
                for _ in range(split_gpu_into)
            ])
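
The excerpt above relies on names defined elsewhere in its project. A plausible, assumed (not verified) set of imports and defaults it would need looks like this; the gpu_settings values are purely illustrative:

from os import environ

from numpy import argsort
from tensorflow import config as tf_config
from gpustat import GPUStatCollection

# Illustrative defaults only; the real project supplies its own gpu_settings.
gpu_settings = {"n_gpus": 1, "min_vram": 4096, "split_gpu_into": 1}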
Code example #7
File: client.py Project: EntilZha/kuro
def get_gpu_list():
    try:
        gpu_collection = GPUStatCollection.new_query()
        gpu_infos = [g.jsonify() for g in gpu_collection]
        gpu_json = {
            'gpus': [{
                'name': g['name'],
                'memory': float(g['memory.total']) / 1024
            } for g in gpu_infos]
        }
        return gpu_json
    except Exception:
        return {'gpus': []}
Code example #8
def log_gpu_state(collection):
    stat = GPUStatCollection.new_query().jsonify()
    if not stat['gpus']:
        logging.error('No gpus found')
    for gpu in stat['gpus']:
        hostname = stat['hostname']
        query_time = stat['query_time']
        index = gpu['index']
        total_memory = gpu['memory.total']
        utilization_memory = gpu['memory.used']
        name = gpu['name']
        power = gpu['power.draw']
        temperature = gpu['temperature.gpu']
        utilization_gpu = gpu['utilization.gpu']
        process_list = gpu['processes']
        active_user_list = [process['username'] for process in process_list]
        collection.insert_one(
            create_gpu_state_dict(hostname, query_time, index, total_memory,
                                  utilization_memory, name, power, temperature,
                                  utilization_gpu, process_list, active_user_list))
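
A hedged usage sketch for the logger above (it assumes pymongo is installed and that the project's create_gpu_state_dict helper is importable; the database and collection names are made up for illustration):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
collection = client['monitoring']['gpu_state']
log_gpu_state(collection)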
Code example #9
File: core.py Project: voidful/shows
    def __my_hardware_state(self, interval=1):
        stat = {"host_name": self.name, "is_alive": True, 'type': self.__type}
        # basic info
        try:
            stat['cpus'] = [{
                "name": self.cpu_info.get('brand_raw', "CPU"),
                "usage": cpu
            } for cpu in psutil.cpu_percent(interval=interval, percpu=True)]
            stat['mem'] = {
                "total": bytes2MB(psutil.virtual_memory().total),
                "used": bytes2MB(psutil.virtual_memory().used)
            }
            stat['disk'] = [{
                "total": bytes2MB(psutil.disk_usage('/').total),
                "used": bytes2MB(psutil.disk_usage('/').used)
            }]
            stat['net'] = {
                "in": bytes2MB(psutil.net_io_counters().bytes_recv),
                "out": bytes2MB(psutil.net_io_counters().bytes_sent)
            }
        except Exception as e:
            print({'error': '%s!' % getattr(e, 'message', str(e))})
            stat['is_alive'] = False

        # gpu info
        stat['gpus'] = []
        try:
            gpu_stat = GPUStatCollection.new_query().jsonify()
            stat['gpus'] = [{
                "name": gpu.get('name', "GPU"),
                "usage": gpu.get('utilization.gpu'),
                "mem_used": gpu.get('memory.used'),
                "mem_total": gpu.get('memory.total'),
                "temp": gpu.get('temperature.gpu')
            } for gpu in gpu_stat['gpus']]
        except Exception:
            pass
        self.status.append(stat)
Code example #10
from gpustat import GPUStatCollection
from pprint import pprint

stat = GPUStatCollection.new_query().jsonify()

pprint(stat)
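
Building on the minimal query above, a short sketch of iterating over the jsonified result; the field names match those used by the other examples in this section:

for gpu in stat['gpus']:
    print("[{}] {}: {}% util, {}/{} MB, {}C".format(
        gpu['index'], gpu['name'], gpu['utilization.gpu'],
        gpu['memory.used'], gpu['memory.total'], gpu['temperature.gpu']))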
Code example #11
File: devices.py Project: GlennWood/mining
def process(self, config, coin):

    devices = {}

    ### scan for AMD metrics using rocm-smi
    try:
        sys.path.append('/opt/rocm/bin')
        rocm_smi = importlib.import_module('rocm_smi')
        for device in sorted(rocm_smi.listDevices()):
            clock = rocm_smi.getCurrentClock(device, 'mem', 'freq')
            if clock is None:
                continue
            clock = clock.replace('Mhz','')
            temp = rocm_smi.getSysfsValue(device, 'temp')
            power = rocm_smi.getSysfsValue(device, 'power').split('.')[0]
            vbios = rocm_smi.getSysfsValue(device, 'vbios')
            gpuid = rocm_smi.getSysfsValue(device, 'id')
            fanspeed = rocm_smi.getFanSpeed(device)
            devices['AMD'+device[4:]] = [ str(temp).replace('.0',''), power, clock, vbios, fanspeed, gpuid ]
    except ImportError as ex:
        if config.PLATFORM == 'AMD' or config.PLATFORM == 'BTH':
            print('ImportError: '+str(ex),file=sys.stderr)
            print("             Try 'sudo apt-get -y install rocm-amdgpu-pro'",file=sys.stderr)
    except OSError as ex:
        if config.VERBOSE:
            if str(ex) and str(ex).find('[Errno 2] No such file or directory') < 0:
                print(ex,file=sys.stderr)
            else:
                print("Cannot discover AMD devices, since 'rocm-smi' is not installed. See 'install/install-amd-pro' for instructions.",file=sys.stderr)

    ### Scan for Nvidia devices using gpustat.GPUStatCollection
    gpu_stats = [ ]
    try:
        from gpustat import GPUStatCollection
        gpu_stats = GPUStatCollection.new_query()
        if config.VERBOSE:
            print(str(len(gpu_stats))+" Nvidia devices found.")
    #except NVMLError_GpuIsLost as ex:
    except NVMLError as ex:
        if str(ex) != 'Driver Not Loaded':
            print('FAIL: '+str(ex), file=sys.stderr)   
        elif ex.value is None or config.PLATFORM == 'NVI' or config.PLATFORM == 'BTH':
            pip = 'pip2'
            if six.PY3: pip = 'pip3'
            print("gpustat for Nvidia GPUs is not installed.\nUse '"+pip+" install gpustat' to install it.",file=sys.stderr)
    except Exception as ex:
        print(ex, file=sys.stderr)

    idx = 0
    for gpu in gpu_stats:
        devices['NVI'+str(gpu.index)] = gpu
        idx += 1
    
    idxNVI = 0
    total_nvi_watts = 0
    total_amd_watts = 0
    for device in sorted(devices):
        if 'AMD' in device:
            dev = devices[device]
            verbose = ''
            if dev[1]:
                power = int(dev[1])
                total_amd_watts += power
                power = '%3iW '%(power)
            else:
                power = ' N/A '
            if dev[2]:
                speed = '%4iMhz'%(int(dev[2]))
            else:
                speed = '  N/A '
            if config.VERBOSE:
                verbose = ' ' + str(int(dev[4]))+'% ' + dev[3] + ' (' + dev[5] + ') '
            print(device+' '+dev[0]+'C '+power+speed+verbose)
        else:
            uuid = ''
            if config.VERBOSE:
                uuid = devices[device].uuid
            watts = devices[device].power_draw 
            if not watts: 
                strWatts = ' N/A' # Some GPUs (looking at you GTX 750) do not return power level
            else:
                strWatts = "%3sW" % (watts)
            print("%s: %2sC %4s %s %s" % (device,devices[device].temperature,strWatts,devices[device].name,uuid))
            if watts: 
                total_nvi_watts += int(watts)
            idxNVI += 1
    total_watts = total_nvi_watts + total_amd_watts
    if total_nvi_watts != 0 and total_nvi_watts != total_watts: print("TOTAL: "+str(total_nvi_watts)+' watts (NVI)')
    if total_amd_watts != 0 and total_amd_watts != total_watts: print("TOTAL: "+str(total_amd_watts)+' watts (AMD)')
    print("TOTAL: "+str(total_watts)+' watts')

    return config.ALL_MEANS_ONCE
Code example #12
def gpu_stats():
    d = GPUStatCollection.new_query().jsonify()
    return jsonify(d)
Code example #13
def process(self, config, coin, quiet=False):
    global OverclockConfig
    # volatile means this operation make changes in settings
    VOLATILE = not config.DRYRUN and not config.QUERY

    postfix = '-' + coin['COIN']
    if config.ALL_COINS: postfix = ''

    if not config.FORCE and not config.DRYRUN and status.get_status(
            None) and VOLATILE:
        if not config.QUICK and not quiet:
            print(
                "A miner is currently running, so we are skipping overclocking (use -f to force)."
            )
        return config.ALL_MEANS_ONCE

    gpu_stats = []
    try:
        gpu_stats = GPUStatCollection.new_query()
    except NameError as ex:
        print('NameError: Cannot load GPUStatCollection.')
        print(ex)
        print("To fix this, do 'pip3 install gpustat'.")
        if not config.DRYRUN:
            return config.ALL_MEANS_ONCE

    except Exception:
        if not config.DRYRUN:
            if config.PLATFORM != 'AMD' and not quiet:
                print('Except: Cannot load GPUStatCollection on platform=' +
                      config.PLATFORM)
                ex = sys.exc_info()
                print(ex)
            elif not config.QUICK and not quiet:
                ### TODO: https://github.com/GPUOpen-Tools/GPA/blob/master/BUILD.md
                print("'miners overclock' is not implemented for AMD devices")
            return config.ALL_MEANS_ONCE

    normalizedDevices = read_overclock_yml()
    sudo_nvidia_settings = get_sudo_nvidia_settings(config)

    xauthority = '~/.Xauthority'
    if sudo_nvidia_settings: xauthority = '/var/lib/lightdm/.Xauthority'
    settings = 'DISPLAY=:0 XAUTHORITY=' + xauthority + ' ' + sudo_nvidia_settings + 'nvidia-settings -c :0'
    nvidia_pwrs = {}
    oper = '-a'
    if config.QUERY:
        if config.VERBOSE:
            oper = '-q'
        else:
            oper = '--terse -q'

    for gpu in gpu_stats:
        # Default to "no setting" so oc/uv are always defined for the checks below.
        oc = uv = None
        if gpu.uuid in normalizedDevices:
            dev = normalizedDevices[gpu.uuid]
            # The OverClock entry holds "oc,uv" pairs: a coin-specific one if
            # present, otherwise the '___' default (falling back to '0,150').
            oc = dev.get('OverClock', {})
            oc = oc.get(coin['COIN'], oc.get('___', '0,150'))
            oc, uv = oc.split(',')

        # old-way, deprecated until we've migrated all into conf/overclock.yml, then will be removed
        elif gpu.uuid.upper() in config.SHEETS['Overclock']:
            dev = config.SHEETS['Overclock'][gpu.uuid.upper()]
            uv = dev['UV']  # default undervolt (or watts-limit)
            if 'UV' + postfix in dev:  # unless a coin-specific one is given
                uv = dev['UV' + postfix]
            oc = dev['OC']  # default overclock
            if 'OC' + postfix in dev:  # unless a coin-specific one is given
                oc = dev['OC' + postfix]

        if oc:
            settings += ' ' + oper + ' "[gpu:' + str(
                gpu.index) + ']/GPUMemoryTransferRateOffset[3]'
            if not config.QUERY: settings += '=' + str(int(oc))
            settings += '"'
        if uv:
            iuv = int(uv)
            if iuv in nvidia_pwrs:
                nvidia_pwrs[iuv].append(str(gpu.index))
            else:
                nvidia_pwrs[iuv] = [str(gpu.index)]

    overclock_dryrun = os.getenv('LOG_RAMDISK',
                                 '/var/local/ramdisk') + '/overclock-dryrun.sh'
    with open(overclock_dryrun, 'w') as fh:
        if not config.QUERY:
            fh.write("echo '%s %i %s'\n\n" %
                     ('Overclocking', len(gpu_stats), 'GPUs.'))
            fh.write('%s\n' % ('sudo nvidia-smi -pm 1'))
        for pwr in nvidia_pwrs:
            if not config.QUERY:
                cmd = "sudo nvidia-smi -i " + ','.join(
                    nvidia_pwrs[pwr]) + " -pl " + str(pwr)
                fh.write('%s\n' % (cmd))
                fh.write("\n")
                if config.VERBOSE: print(cmd)
        fh.write(settings)
        fh.write("\n")
        if config.VERBOSE: print(settings)
    os.chmod(
        overclock_dryrun, stat.S_IXUSR | stat.S_IXGRP | stat.S_IWUSR
        | stat.S_IWGRP | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    if os.getenv('MINERS_USER'):
        os.chown(overclock_dryrun,
                 getpwnam(os.getenv('MINERS_USER')).pw_uid, -1)

    if config.DRYRUN:
        print("\nexport DISPLAY=:0\nexport XAUTHORITY=" + xauthority + "\n")
        with open(overclock_dryrun, 'r') as fh:
            print(fh.read().replace('-a', " \\\n    -a"))
    else:
        overclock_filename = os.getenv('LOG_RAMDISK',
                                       '/var/local/ramdisk') + '/overclock.sh'
        if VOLATILE and not config.FORCE and os.path.isfile(
                overclock_filename) and filecmp.cmp(overclock_dryrun,
                                                    overclock_filename):
            if not config.QUICK and not config.QUERY:
                timestamp = time.ctime(os.path.getctime(overclock_filename))
                print(
                    "Overclock settings are identical to those already set at '"
                    + timestamp +
                    "', so we are keeping them (use -f to force).")
        else:
            os.rename(overclock_dryrun, overclock_filename)
            os.system("/bin/bash " + overclock_filename)
        if config.VERBOSE:
            with open(overclock_dryrun, 'r') as fh:
                print(fh.read())

    if os.path.isfile(overclock_dryrun): os.remove(overclock_dryrun)
    return config.ALL_MEANS_ONCE