Example #1
 def get(index):
     try:
         handle = pynvml.nvmlDeviceGetHandleByIndex(index)
     except pynvml.NVMLError_GpuIsLost:
         return None
     memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
     return dict(
         nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
         nvmlDeviceGetMemoryInfo=dict(
             total=memory_info.total,
             free=memory_info.free,
             used=memory_info.used,
         ),
         nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
         nvmlDeviceGetFanSpeed=get_fan_speed(handle),
         nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
             handle, pynvml.NVML_TEMPERATURE_GPU),
         nvmlDeviceGetTemperatureThreshold=dict(
             slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                 handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
             shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                 handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
         ),
         nvmlDeviceGetPowerManagementLimit=pynvml.nvmlDeviceGetPowerManagementLimit(
             handle),
         nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
     )
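A minimal sketch of how this collector might be driven, assuming NVML has already been set up and that get_utilization_rates and get_fan_speed are helper functions defined alongside it in the same module (the driver loop below is illustrative, not part of the original source):

import pynvml

pynvml.nvmlInit()                      # NVML must be initialised before any device query
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        info = get(i)                  # returns None when the GPU has fallen off the bus
        if info is not None:
            print(i, info['nvmlDeviceGetTemperature'])
finally:
    pynvml.nvmlShutdown()              # always release the NVML context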
Example #2
    def collect_metrics(self):
        """
        Collect NVIDIA GPU metrics (e.g. temperature, power consumption, fan speed, etc.)
        """
        data_list = []
        for gpu_num in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(gpu_num)
            device_name = DEVICE_NAME_FORMAT % gpu_num
            power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0
            fan_speed = nvmlDeviceGetFanSpeed(handle)
            temperature = nvmlDeviceGetTemperature(handle,
                                                   NVML_TEMPERATURE_GPU)
            data_list.append({
                'measurement': device_name,
                'tags': {
                    'host': 'minar',
                    'gpu': device_name
                },
                'fields': {
                    'power_usage': power_usage,
                    'fan_speed': fan_speed,
                    'temperature': temperature
                }
            })
            time.sleep(PERIOD_SECS)

        return data_list
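The dictionaries built by collect_metrics follow the measurement/tags/fields point layout used by the influxdb Python client; assuming that is the intended sink, the collector might be wired up roughly like this (host, port, database name and the collector instance are placeholders, not taken from the original project):

from influxdb import InfluxDBClient

client = InfluxDBClient(host='localhost', port=8086, database='gpu_metrics')  # assumed connection settings
points = collector.collect_metrics()   # collector: an instance of the class this method belongs to (not shown above)
client.write_points(points)            # one point per GPU, tagged with the device name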
Example #3
def _get_gpu_usage(gpu_count):
    import pynvml
    gpus = []
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU)
            try:
                power_usage = (
                    pynvml.nvmlDeviceGetPowerUsage(handle) /
                    1000.0) / (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) /
                               1000.0) * 100
            except pynvml.NVMLError as e:
                logger.error(
                    "Coudln't extract power usage due to NVML exception: {}".
                    format(str(e)))
                power_usage = -9999
            gpus.append(
                (handle, util.gpu, util.memory,
                 (memory.used / float(memory.total)) * 100, temp, power_usage))
        except pynvml.NVMLError as e:
            logger.error(
                "Coudln't extract gpu usage information due to NVML exception: {}"
                .format(str(e)))
            return None
    return gpus
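_get_gpu_usage expects NVML to be initialised already and reports failures through a module-level logger; a hedged call site under those assumptions:

import logging
import pynvml

logger = logging.getLogger(__name__)   # the function above logs NVML errors through this logger
pynvml.nvmlInit()
try:
    usage = _get_gpu_usage(pynvml.nvmlDeviceGetCount())   # None if any GPU could not be queried
finally:
    pynvml.nvmlShutdown()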
Example #4
def temperatures():
    ret = {}
    for i in range(nv.nvmlDeviceGetCount()):
        hdl = nv.nvmlDeviceGetHandleByIndex(i)
        temp = nv.nvmlDeviceGetTemperature(hdl, nv.NVML_TEMPERATURE_GPU)
        ret[i] = temp
    return ret
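temperatures() assumes nv is the pynvml module and that nvmlInit() has been called elsewhere; a minimal sketch of using it on its own:

import pynvml as nv

nv.nvmlInit()
try:
    for idx, celsius in temperatures().items():
        print(f"GPU {idx}: {celsius} C")
finally:
    nv.nvmlShutdown()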
Example #5
def device_status(device_index):
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            users.append(proc.username())
            dates.append(proc.create_time())
            if cmd is None:
                cmd = parse_cmd_roughly(proc.cmdline())
        except psutil.NoSuchProcess:
            users.append('?')
    return {
        'type': device_name,
        'is_available': len(pids) == 0,
        'pids': ','.join([str(pid) for pid in pids]),
        'users': ','.join(users),
        'running_since':
        arrow.get(min(dates)).humanize() if len(dates) > 0 else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
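device_status relies on psutil, arrow and a parse_cmd_roughly helper from the surrounding project, and assumes NVML is already initialised; a minimal driver under those assumptions:

import pynvml as nv

nv.nvmlInit()
try:
    for i in range(nv.nvmlDeviceGetCount()):
        info = device_status(i)
        state = 'free' if info['is_available'] else 'used by ' + info['users']
        print(i, info['type'], state)
finally:
    nv.nvmlShutdown()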
Example #6
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #7
def getTemp(handle):
    try:
        temp = str(
            pynvml.nvmlDeviceGetTemperature(handle,
                                            pynvml.NVML_TEMPERATURE_GPU))
    except pynvml.NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp
Example #8
def get_temperature(handle):
    temp = -1
    try:
        temp = pynvml.nvmlDeviceGetTemperature(handle,
                                               pynvml.NVML_TEMPERATURE_GPU)
    except Exception:
        pass

    return temp
Example #9
 def get_nvidia_gpu_temper(self, handle, sensor=0):
     '''
     Get the temperature of the NVIDIA GPU
     :param handle:
     :param sensor:
     :return:
     '''
     temper = pynvml.nvmlDeviceGetTemperature(handle, sensor)
     return temper
Example #10
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))

    nvmlShutdown()
    return gpus
Example #12
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {
                    'gpu': util_gpu,
                    'memory': util_mem
                },
                'memory': {
                    'total': mem_total,
                    'free': mem_free,
                    'used': mem_used
                },
                'temperature': temperature,
                'power': {
                    'draw': power_draw,
                    'limit': power_limit
                }
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #14
 def get_gpu_temp(self):
     try:
         gpu_handle = nvmlDeviceGetHandleByIndex(0)
         temp_data = nvmlDeviceGetTemperature(gpu_handle, 0)
         if temp_data:
             return temp_data
     except Exception:
         self.__logger.error("SensorsThread.get_cpu_temp Exception",
                             exc_info=True)
     return None
Example #15
    def stats(self):
        stats = {}
        for i in range(0, self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                in_use_by_us = gpu_in_use_by_this_process(handle)

                stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.{}.{}".format(
                    i, "memoryAllocated")] = (memory.used /
                                              float(memory.total)) * 100
                stats["gpu.{}.{}".format(i, "temp")] = temp

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                    stats["gpu.process.{}.{}".format(i,
                                                     "memory")] = util.memory
                    stats["gpu.process.{}.{}".format(
                        i, "memoryAllocated")] = (memory.used /
                                                  float(memory.total)) * 100
                    stats["gpu.process.{}.{}".format(i, "temp")] = temp

                    # Some GPUs don't provide information about power usage
                try:
                    power_watts = pynvml.nvmlDeviceGetPowerUsage(
                        handle) / 1000.0
                    power_capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                        handle) / 1000.0
                    power_usage = (power_watts / power_capacity_watts) * 100

                    stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                    if in_use_by_us:
                        stats["gpu.process.{}.{}".format(
                            i, "powerWatts")] = power_watts
                        stats["gpu.process.{}.{}".format(
                            i, "powerPercent")] = power_usage

                except pynvml.NVMLError as err:
                    pass

            except pynvml.NVMLError as err:
                pass
        if psutil:
            #net = psutil.net_io_counters()
            sysmem = psutil.virtual_memory()
            stats["cpu"] = psutil.cpu_percent()
            stats["memory"] = sysmem.percent
        return stats
Example #16
def get_gpu_status(gpu_index=0):
    # init for getting
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except:
        power_limit = None

    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }
    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
Example #17
def gpu_info(gpu_handle, i: int = 0) -> List[Dict[str, Any]]:
    power = pynvml.nvmlDeviceGetPowerUsage(gpu_handle) / 1000
    temperature = pynvml.nvmlDeviceGetTemperature(gpu_handle,
                                                  pynvml.NVML_TEMPERATURE_GPU)
    free_memory = best_prefix(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).free)
    return [
        dict(full_text=f'GPU Power {power:.1f} W', name=f'gpu{i}_power'),
        dict(full_text=free_memory.format('GPU RAM {value:.1f} {unit}'),
             name=f'gpu{i}_free_memory'),
        dict(full_text=f'GPU Temp {temperature} ℃',
             name=f'gpu{i}_temperature'),
    ]
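gpu_info builds i3bar-style status blocks and assumes best_prefix is a byte-formatting helper from the surrounding status-bar project; a hedged sketch of collecting blocks for every device (the NVML setup is added here for completeness):

import pynvml

pynvml.nvmlInit()
try:
    blocks = []
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        blocks.extend(gpu_info(handle, i))
finally:
    pynvml.nvmlShutdown()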
Example #18
def get_heat_realtime(free_gpus):
    free_gpus_temp = []
    N.nvmlInit()
    for index in free_gpus:
        handle = N.nvmlDeviceGetHandleByIndex(index)
        try:
            temperature = float(
                N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU))
        except N.NVMLError:
            temperature = float('inf')
        free_gpus_temp += [temperature]
    return free_gpus_temp
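A typical follow-up is to pick the coolest of the free GPUs; a short sketch under the assumption that free_gpus is a list of candidate device indices:

free_gpus = [0, 1, 2]                          # illustrative candidate indices
temps = get_heat_realtime(free_gpus)
coolest = free_gpus[temps.index(min(temps))]   # unreadable temperatures sort last (float('inf'))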
Example #19
 def get_temperature(self):
     if self.nvh is None:
         return None
     t = None
     try:
         t = nv.nvmlDeviceGetTemperature(self.nvh, nv.NVML_TEMPERATURE_GPU)
         self.temperature = t
     except:
         logger.error(
             f"{self.pci_dev.slot_name}/{self.name}] get temperature failed !!"
         )
     return t
Example #20
    def report(self):
        self.loss_report /= self.update_decay_steps
        self.acc_report /= self.update_decay_steps

        for acc_step, loss_step in zip(self.acc_report.tolist(), self.loss_report.tolist()):
            self.stats.train_record(acc_step, loss_step)

        infos = list(nvmlDeviceGetMemoryInfo(handler) for handler in self.gpu_info_handler)
        tems = list(nvmlDeviceGetTemperature(handler, 0) for handler in self.gpu_info_handler)
        info_used = sum(info.used for info in infos)
        info_total = sum(info.total for info in infos)

        output_str = (
            'Step: %6d, acc:%6.2f (%6.2f~%6.2f), loss:%5.2f (%5.2f~%5.2f), '
            'lr: %.4f, bc: %d/%d, bs: %5d, tks: %6d+%6d, t: %5.2f, m: %5.2f/%5.2f, tem: %sC'
            % (self.processed_steps,
               self.acc_report.mean() * 100,
               self.acc_report.min() * 100,
               self.acc_report.max() * 100,
               self.loss_report.mean(),
               self.loss_report.min(),
               self.loss_report.max(),
               self.lr,
               self.src_tokens.sum() + self.tgt_tokens.sum(),
               self.src_num_pad_tokens.sum() + self.tgt_num_pad_tokens.sum(),
               self.num_examples.sum(),
               self.src_tokens.sum(),
               self.tgt_tokens.sum(),
               self.time_sum,
               info_used / self.memory_unit,
               info_total / self.memory_unit,
               '/'.join(str(x) for x in tems))
        )

        self.stats.log_to_file(output_str)
        print(output_str)

        self.acc_report.fill(0)
        self.loss_report.fill(0)
        self.update_decay_steps.fill(0)
        self.src_tokens.fill(0)
        self.tgt_tokens.fill(0)
        self.src_num_pad_tokens.fill(0)
        self.tgt_num_pad_tokens.fill(0)
        self.num_examples.fill(0)
        self.time_sum = 0.0

        if max(info.used / info.total for info in infos) > self.gpu_memory_limit:
            torch.cuda.empty_cache()

        return
Example #21
    def get_frame(self):
        # Get CPU and GPU usage percentages
        self.cpu_usage_percent = psutil.cpu_percent()
        self.gpu_usage_percent = nvml.nvmlDeviceGetUtilizationRates(
            self.selected_gpu).gpu

        # Create a deep copy of template to work on
        frame = deepcopy(self.template)

        # Draw progress bar for each usage
        def getUsageLineColor(usage):
            if usage < 30.:
                return self.GREEN_COLOR
            elif usage < 70.:
                return self.YELLOW_COLOR
            else:
                return self.RED_COLOR

        cpu_line_length_px = int(self.cpu_usage_percent / 100. *
                                 self.MAX_LINE_LENGTH_PX)
        gpu_line_length_px = int(self.gpu_usage_percent / 100. *
                                 self.MAX_LINE_LENGTH_PX)

        frame[1:(cpu_line_length_px + 1),
              0:self.LINE_WIDTH, :] = getUsageLineColor(self.cpu_usage_percent)
        frame[18:(gpu_line_length_px + 18),
              0:self.LINE_WIDTH, :] = getUsageLineColor(self.gpu_usage_percent)

        # Color CPU and GPU text according to temperature
        def getCPUGPUTextColor(temp):
            if temp < 45.:
                return self.GREEN_COLOR
            elif temp < 74.:
                return self.YELLOW_COLOR
            else:
                return self.RED_COLOR

        self.cpu_temperature = self._get_cpu_temperature()
        self.gpu_temperature = nvml.nvmlDeviceGetTemperature(
            self.selected_gpu, nvml.NVML_TEMPERATURE_GPU)

        IMAGE_CHANNELS_AXIS = -1  # Last axis
        frame[(frame == self.CPU_TEXT_COLOR_VALUE).all(
            axis=IMAGE_CHANNELS_AXIS)] = getCPUGPUTextColor(
                self.cpu_temperature)
        frame[(frame == self.GPU_TEXT_COLOR_VALUE).all(
            axis=IMAGE_CHANNELS_AXIS)] = getCPUGPUTextColor(
                self.gpu_temperature)

        return frame.flatten()
Example #22
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {'gpu': util_gpu, 'memory': util_mem},
                'memory': {'total': mem_total, 'free': mem_free,
                           'used': mem_used},
                'temperature': temperature,
                'power': {'draw': power_draw, 'limit': power_limit}
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #23
def get_gpu_info(redis_con: redis.Redis, gpus_compat: List[int]) -> Tuple[List[dict], bool, bool]:
    """Returns information about GPUs

    :param redis_con: an instance of Redis connection
    :param gpus_compat: a list of GPUs that are compatible with TensorFlow
    :return: Tuple that contains [0] list of dictionaries with information about every GPU
                                 [1] boolean that states if the available GPUs for training changed
                                 [2] boolean that states if the available GPUs for inference changed
    """
    if not GPU_MODE:
        return [], False, False
    gpu_data = []
    gpu_ok_train = []
    gpu_ok_inf = []
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        except pynvml.NVMLError:
            util = -1
        try:
            mem_use = pynvml.nvmlDeviceGetMemoryInfo(handle).used
        except pynvml.NVMLError:
            mem_use = -1
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError:
            temp = -1
        mem_tot = int(redis_con.lindex(RKEY_HWINFO_GPU_MEM_AVAIL, i))
        if i in gpus_compat and mem_tot - mem_use >= int(os.environ['TRAINING_GPU_MIN_MEM']):
            gpu_ok_train.append(i)
        if i in gpus_compat and mem_tot - mem_use >= int(os.environ['INFERENCE_GPU_MIN_MEM']):
            gpu_ok_inf.append(i)
        gpu_data.append({
            'name': redis_con.lindex(RKEY_HWINFO_GPU_NAME, i),
            'index': i,
            'tf_compatibility': i in gpus_compat,
            'utilization': util,
            'memory': {
                'total': mem_tot,
                'used': mem_use
            },
            'temperature': temp
        })
    gpus_train_change = db_update_available_gpus(redis_con, os.environ['REDIS_KEY_TRAINING_GPUS_OK'], gpu_ok_train)
    gpus_inf_change = db_update_available_gpus(redis_con, os.environ['REDIS_KEY_INFERENCE_GPUS_OK'], gpu_ok_inf)
    return gpu_data, gpus_train_change, gpus_inf_change
Example #24
 def run(self):
     while True:
         temp = str(
             pynvml.nvmlDeviceGetTemperature(self.handle,
                                             pynvml.NVML_TEMPERATURE_GPU))
         name = str(pynvml.nvmlDeviceGetName(self.handle))
         meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
         total = str(round(meminfo.total / 1024**3, 2))
         used = str(round(meminfo.used / 1024**3, 2))
         free = str(round(meminfo.free / 1024**3, 2))
         data = [{
             'temp': temp,
             'name': name,
             'total': total,
             'used': used,
             'free': free
         }]
         self.update_data.emit(data)
         time.sleep(1)
Example #25
def get_gpu_stat(handle):
	ret = {}

	# get temperature
	try:
		ret['temp'] = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
	except:
		ret['temp'] = None

	# get power usage
	try:
		ret['watt'] = N.nvmlDeviceGetPowerUsage(handle) / 1000
	except:
		ret['watt'] = None

	ret['fan'] = 0

	# return information gathered
	#print("temp: {0}, watt: {1}".format(ret['temp'], ret['watt']))
	return ret
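get_gpu_stat only needs an NVML device handle; a short, bounded polling sketch under that assumption:

import time
import pynvml as N

N.nvmlInit()
handle = N.nvmlDeviceGetHandleByIndex(0)
try:
    for _ in range(3):                          # a few samples, purely illustrative
        stat = get_gpu_stat(handle)
        print(stat['temp'], stat['watt'], stat['fan'])
        time.sleep(5)
finally:
    N.nvmlShutdown()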
Example #26
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index), stat_name=stat_name)
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #27
    def check(self, instance):
        try:
            pynvml.nvmlInit()
            deviceCount = pynvml.nvmlDeviceGetCount()
            for device_id in xrange(deviceCount):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
                name = pynvml.nvmlDeviceGetName(handle)
                tags = dict(name="{}-{}".format(name, device_id))
                d_tags = self._dict2list(tags)
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                # utilization info
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
                # memory info
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
                # temperature info
                self.gauge('nvml.temp.', temp, tags=d_tags)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            status = AgentCheck.OK
            msg = u'Ok'
        except:
            status = AgentCheck.CRITICAL
            msg = u'Error'
        finally:
            pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #28
 def GpuInfo(self):
     """
     Get GPU utilization information
     :return: GPU utilization information
     """
     gpuInfoList = list()
     for gpuIndex, gpuHandle in enumerate(self.__gpuHandleList):
         gpuPercent = pynvml.nvmlDeviceGetUtilizationRates(gpuHandle).gpu
         memTotal = format(
             pynvml.nvmlDeviceGetMemoryInfo(gpuHandle).total / self.MB,
             ".2f")
         memUsed = format(
             pynvml.nvmlDeviceGetMemoryInfo(gpuHandle).used / self.MB,
             ".2f")
         temp = pynvml.nvmlDeviceGetTemperature(gpuHandle,
                                                pynvml.NVML_TEMPERATURE_GPU)
         gpuInfoList.append(
             dict(Index=gpuIndex,
                  Percent=gpuPercent,
                  TotalMem=memTotal,
                  UsedMem=memUsed,
                  Temp=temp))
     return gpuInfoList
Example #29
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #30
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index':
                index,
                'uuid':
                uuid,
                'name':
                name,
                'temperature.gpu':
                temperature,
                'fan.speed':
                fan_speed,
                'utilization.gpu':
                utilization.gpu if utilization else None,
                'power.draw':
                power // 1000 if power is not None else None,
                'enforced.power.limit':
                power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                memory.used // MB if memory else None,
                'memory.total':
                memory.total // MB if memory else None,
                'processes':
                processes,
            }
            return gpu_info
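get_gpu_info above depends on module-level pieces of its project: N as the pynvml alias, MB as the byte-to-megabyte divisor, and a _decode helper for byte strings. Assuming those, a full query over every device might look roughly like this:

import pynvml as N

MB = 1024 * 1024

def _decode(b):
    return b.decode() if isinstance(b, bytes) else b

N.nvmlInit()
try:
    gpu_list = [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
                for i in range(N.nvmlDeviceGetCount())]
finally:
    N.nvmlShutdown()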
Example #31
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU."""

    gpus = []
    device_count = -1

    try:
        nvmlInit()

        device_count = nvmlDeviceGetCount()

        gpus = [{}] * device_count

        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)

            gpus[i]['name'] = device_name

            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}', file=sys.stderr)

            try:
                temp = nvmlDeviceGetTemperature(handle, 0)
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}', file=sys.stderr)

            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}', file=sys.stderr)

            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}', file=sys.stderr)

            gpus[i]['processes'] = []

            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)

                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append({'pid': process.pid, 'name': process_name})

            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}', file=sys.stderr)

    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)

    nvmlShutdown()

    return gpus, device_count
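A hedged example of consuming the result, printing which processes occupy each device:

gpus, count = get_gpu_pid_info()
for gpu in gpus:
    procs = ', '.join('{} ({})'.format(p['name'], p['pid']) for p in gpu['processes']) or 'idle'
    print('GPU {} {}: {}'.format(gpu['id'], gpu.get('name'), procs))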
Example #32
def _get_full_status_nvml():
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {
                'gpu': util.gpu,
                'memory': util.memory
            },
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {
                'free': mem_info.free,
                'used': mem_info.used
            },
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(
                handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {
            'devices': devices_status
        },
        'full': {
            'devices': devices_full_status
        }
    }
    return status
Example #33
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        gpus_in_use = 0
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
                gpus_in_use += 1 if util_rate.memory > 50.0 else 0
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                self.gauge('nvml.process.count', len(cps), d_tags)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags['puser'] = self.get_process_owner(ps.pid)
                    docker_name, docker_image = self.get_container_name(ps.pid)
                    p_tags['docker_image'] = docker_image
                    p_tags['docker_name'] = docker_name
                    p_tags = self._dict2list(p_tags)
                    print p_tags
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        self.gauge('nvml.gpus_in_use_count', gpus_in_use)
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #34
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						continue
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			index = 2
			self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
			self.wfile.write(bytes(host, 'utf-8'))
			self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				if not parameters.get('host' + str(index)).strip():
					pass
				elif not parameters.get('start' + str(index)).strip():
					pass
				elif not parameters.get('end' + str(index)).strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Example #35
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
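A minimal driver sketch for the per-handle helper above (the get_all_gpu_info name is illustrative; the N alias for pynvml and the MB constant follow the snippet, and NVML setup/teardown is shown explicitly):

import pynvml as N

MB = 1024 * 1024

def get_all_gpu_info(get_gpu_info):
    """Apply a per-handle helper (such as the one above) to every device."""
    N.nvmlInit()
    try:
        return [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
                for i in range(N.nvmlDeviceGetCount())]
    finally:
        N.nvmlShutdown()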
Beispiel #36
0
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except Exception:
                msg = "cannot get information of network interface %s" % \
                      self.nif
                self.warning(msg)

        if self.doDiskIO:
            try:
                # measure a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except Exception:
                msg = "cannot get information of disk usage"
                self.warning(msg)

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%"+"(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor): %s. I continue" % e)

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
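The network and disk branches above multiply the byte deltas by samplingTime before converting to megabytes; if the goal is a throughput figure, dividing the delta by the interval gives MB/s instead. A standalone sketch under that assumption (the interface name and the one-second interval are placeholders):

import time
import psutil

def net_rate_mb_per_s(nif='eth0', interval=1.0):
    """Return (sent, received) throughput in MB/s for one network interface."""
    before = psutil.net_io_counters(pernic=True)[nif]
    time.sleep(interval)
    after = psutil.net_io_counters(pernic=True)[nif]
    sent_mb = (after.bytes_sent - before.bytes_sent) / 1048576.0
    recv_mb = (after.bytes_recv - before.bytes_recv) / 1048576.0
    return sent_mb / interval, recv_mb / interval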
Beispiel #37
0
 def info_refresh(self):
     
     try:
         stat = open("/proc/stat")
         self.statlines = stat.read().splitlines()[1:-1]
         stat.close()
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         for j in self.statlines[i].split()[1:]:  # skip the "cpuN" label field
             self.total[i] += int(j)
         self.idle[i] = int(self.statlines[i].split()[4])
     
     for i in range(self.corecount):
         # Only update the bar when this core recorded new ticks (avoids a
         # division by zero), but always refresh prev_* and reset the
         # counters so the next pass starts from a clean state.
         if (self.total[i] - self.prev_total[i]) != 0:
             self.cpu_prog_bars[i].set_fraction(
                 1 - ((self.idle[i] - self.prev_idle[i]) /
                      (self.total[i] - self.prev_total[i])))
         self.prev_idle[i] = self.idle[i]
         self.prev_total[i] = self.total[i]
         self.idle[i] = 0
         self.total[i] = 0
     
     for i in range(self.deviceCount):
         
         util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
         temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
         memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
         (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
         (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
         
         mem_total = memInfo.total / 1024 / 1024
         mem_used = memInfo.used / 1024 / 1024
         
         self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
         ########
         self.util_history.append(util.gpu)
         self.util_graph.queue_draw()
         
         self.temp_history.append(temp)
         self.temp_graph.queue_draw()
         ########
         self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory)
         self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100)
         
         self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util)
         self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util)
         self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100)
         self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100)
         
         self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
         self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total)
         
         self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp)
         if temp > 100:  # clamp to the 0-100 range used by the fraction
             temp = 100
         elif temp < 0:
             temp = 0
         self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100)
         
         
     #--proc--
     procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
     
     proc_liststore = Gtk.ListStore(int, str, int)
     
     for p in procs:
         pid = p.pid
         try:
             path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
         except pynvml.NVMLError:
             # the process may have exited between the two NVML calls
             self.exit()
         if p.usedGpuMemory is None:
             mem = 0
         else:
             # integer MiB, matching the int column of the ListStore
             mem = p.usedGpuMemory // 1024 // 1024
         proc_liststore.append([pid, path, mem])
     self.tree.set_model(proc_liststore)
     return True
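The per-core arithmetic above is the usual /proc/stat delta technique: usage = 1 - Δidle/Δtotal between two samples. A standalone, Linux-only sketch of the same idea (field indices follow the proc(5) layout; the 0.5 s interval is arbitrary):

import time

def cpu_usage_per_core(interval=0.5):
    """Per-core usage fractions computed from two /proc/stat samples."""
    def snapshot():
        with open('/proc/stat') as stat:
            rows = [line.split() for line in stat
                    if line.startswith('cpu') and line[3].isdigit()]
        # fields[1:] are jiffy counters; fields[4] is the idle counter
        return [(sum(int(v) for v in row[1:]), int(row[4])) for row in rows]

    before = snapshot()
    time.sleep(interval)
    after = snapshot()
    usage = []
    for (total0, idle0), (total1, idle1) in zip(before, after):
        delta_total = total1 - total0
        usage.append(0.0 if delta_total == 0
                     else 1.0 - (idle1 - idle0) / delta_total)
    return usage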
Beispiel #38
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
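Every metric in the check above is wrapped in its own try/except so that one unsupported query does not abort the whole run. A small helper sketch of that per-metric pattern (the safe_nvml name and the error-list convention are illustrative, not part of any agent API):

import pynvml

def safe_nvml(errors, label, fn, *args):
    """Run one NVML query; on failure record the error and return None."""
    try:
        return fn(*args)
    except pynvml.NVMLError as err:
        errors.append(u'{}:{}'.format(label, err))
        return None

# usage sketch (after pynvml.nvmlInit()):
# errors = []
# handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# temp = safe_nvml(errors, 'nvmlDeviceGetTemperature',
#                  pynvml.nvmlDeviceGetTemperature,
#                  handle, pynvml.NVML_TEMPERATURE_GPU)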
Beispiel #39
0
    def _get_data(self):
        data = {}

        if self.deviceCount:
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                brand = pynvml.nvmlDeviceGetBrand(handle)

                ### Get data ###
                ## Memory usage
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                except Exception as e:
                    self.debug(str(e))
                    mem = None

                ## ECC errors
                try:
                    eccErrors = {}
                    eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                    memErrorType = [
                        'ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED'
                    ]
                    memoryLocationType = [
                        'L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                        'REGISTER_FILE', 'TEXTURE_MEMORY'
                    ]
                    for memoryLocation in range(5):
                        # build fresh dicts per location/counter so the
                        # entries do not all alias one shared dict
                        _eccCounter = {}
                        for eccCounter in range(2):
                            _memError = {}
                            for memError in range(2):
                                _memError[memErrorType[
                                    memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(
                                        handle, memError, eccCounter,
                                        memoryLocation)
                            _eccCounter[eccCounterType[eccCounter]] = _memError
                        eccErrors[
                            memoryLocationType[memoryLocation]] = _eccCounter
                except Exception as e:
                    self.debug(str(e))
                    eccErrors = None

                ## Temperature
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU)
                except Exception as e:
                    self.debug(str(e))
                    temp = None

                ## Fan
                try:
                    fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
                except Exception as e:
                    self.debug(str(e))
                    fanspeed = None

                ## GPU and Memory Utilization
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_util = util.gpu
                    mem_util = util.memory
                except Exception as e:
                    self.debug(str(e))
                    gpu_util = None
                    mem_util = None

                ## Encoder Utilization
                try:
                    encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                    enc_util = encoder[0]
                except Exception as e:
                    self.debug(str(e))
                    enc_util = None

                ## Decoder Utilization
                try:
                    decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                    dec_util = decoder[0]
                except Exception as e:
                    self.debug(str(e))
                    dec_util = None

                ## Clock frequencies
                try:
                    clock_core = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_GRAPHICS)
                    clock_sm = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_SM)
                    clock_mem = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
                except Exception as e:
                    self.debug(str(e))
                    clock_core = None
                    clock_sm = None
                    clock_mem = None

                ### Packing data ###
                self.debug("Device", gpuIdx, ":", str(name))
                data["device_name_" + gpuIdx] = name

                self.debug("Brand:", str(brand))

                self.debug(str(name), "Temp      :", str(temp))
                data["device_temp_" + gpuIdx] = temp

                self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
                data["device_mem_total_" + gpuIdx] = mem.total

                self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
                data["device_mem_used_" + gpuIdx] = mem.used

                self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
                data["device_mem_free_" + gpuIdx] = mem.free

                self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
                data["device_load_gpu_" + gpuIdx] = gpu_util

                self.debug(str(name), "Load MEM  :", str(mem_util), '%')
                data["device_load_mem_" + gpuIdx] = mem_util

                self.debug(str(name), "Load ENC  :", str(enc_util), '%')
                data["device_load_enc_" + gpuIdx] = enc_util

                self.debug(str(name), "Load DEC  :", str(dec_util), '%')
                data["device_load_dec_" + gpuIdx] = dec_util

                self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
                data["device_core_clock_" + gpuIdx] = clock_core

                self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
                data["device_sm_clock_" + gpuIdx] = clock_sm

                self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
                data["device_mem_clock_" + gpuIdx] = clock_mem

                self.debug(str(name), "Fan speed :", str(fanspeed), '%')
                data["device_fanspeed_" + gpuIdx] = fanspeed

                self.debug(str(name), "ECC errors:", str(eccErrors))
                if eccErrors is not None:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                else:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = None

        ## Get unit (S-class Nvidia cards) data
        if self.unitCount:
            for i in range(self.unitCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlUnitGetHandleByIndex(i)

                try:
                    fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                    fan_speed = fan.speed  # Fan speed (RPM)
                    fan_state = fan.state  # Flag that indicates whether fan is working properly
                except Exception as e:
                    self.debug(str(e))
                    fan_speed = None
                    fan_state = None

                try:
                    psu = pynvml.nvmlUnitGetPsuInfo(handle)
                    psu_current = psu.current  # PSU current (A)
                    psu_power = psu.power  # PSU power draw (W)
                    psu_state = psu.state  # The power supply state
                    psu_voltage = psu.voltage  # PSU voltage (V)
                except Exception as e:
                    self.debug(str(e))
                    psu_current = None
                    psu_power = None
                    psu_state = None
                    psu_voltage = None

                try:
                    temp_intake = pynvml.nvmlUnitGetTemperature(
                        handle, 0)  # Temperature at intake in C
                    temp_exhaust = pynvml.nvmlUnitGetTemperature(
                        handle, 1)  # Temperature at exhaust in C
                    temp_board = pynvml.nvmlUnitGetTemperature(
                        handle, 2)  # Temperature on board in C
                except Exception as e:
                    self.debug(str(e))
                    temp_intake = None
                    temp_exhaust = None
                    temp_board = None

                self.debug('Unit fan speed:', str(fan_speed))
                data["unit_fan_speed_" + gpuIdx] = fan_speed

                self.debug('Unit fan state:', str(fan_state))
                data["unit_fan_state_" + gpuIdx] = fan_state

                self.debug('Unit PSU current:', str(psu_current))
                data["unit_psu_current_" + gpuIdx] = psu_current

                self.debug('Unit PSU power:', str(psu_power))
                data["unit_psu_power_" + gpuIdx] = psu_power

                self.debug('Unit PSU state:', str(psu_state))
                data["unit_psu_state_" + gpuIdx] = psu_state

                self.debug('Unit PSU voltage:', str(psu_voltage))
                data["unit_psu_voltage_" + gpuIdx] = psu_voltage

                self.debug('Unit temp intake:', str(temp_intake))
                data["unit_temp_intake_" + gpuIdx] = temp_intake

                self.debug('Unit temp exhaust:', str(temp_exhaust))
                data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

                self.debug('Unit temp board:', str(temp_board))
                data["unit_temp_board_" + gpuIdx] = temp_board

        ## Get data via legacy mode
        if self.legacy:
            try:
                output, error = Popen([
                    "nvidia-settings", "-c", ":0", "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp", "-q",
                    "TotalDedicatedGPUMemory", "-q", "UsedDedicatedGPUMemory"
                ],
                                      shell=False,
                                      stdout=PIPE,
                                      stderr=PIPE).communicate()
                output = repr(str(output))
                if len(output) < 800:
                    raise Exception(
                        'Error in fetching data from nvidia-settings ' +
                        output)
                self.debug(str(error), output)
            except Exception as e:
                self.error(str(e))
                self.error('Setting legacy mode to False')
                self.legacy = False
                return data
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                if data["device_temp_" + gpuIdx] is None:
                    coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                       output)[i][1]
                    try:
                        data["device_temp_" + gpuIdx] = int(coreTemp)
                        self.debug('Using legacy temp for GPU {0}: {1}'.format(
                            gpuIdx, coreTemp))
                    except Exception as e:
                        self.debug(str(e), "skipping device_temp_" + gpuIdx)
                if data["device_mem_used_" + gpuIdx] is None:
                    memUsed = findall(
                        r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                        output)[i][1]
                    try:
                        data["device_mem_used_" + gpuIdx] = int(memUsed)
                        self.debug(
                            'Using legacy mem_used for GPU {0}: {1}'.format(
                                gpuIdx, memUsed))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_used_" + gpuIdx)
                if data["device_load_gpu_" + gpuIdx] is None:
                    gpu_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][1]
                    try:
                        data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                        self.debug(
                            'Using legacy load_gpu for GPU {0}: {1}'.format(
                                gpuIdx, gpu_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_gpu_" + gpuIdx)
                if data["device_load_mem_" + gpuIdx] is None:
                    mem_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][2]
                    try:
                        data["device_load_mem_" + gpuIdx] = int(mem_util)
                        self.debug(
                            'Using legacy load_mem for GPU {0}: {1}'.format(
                                gpuIdx, mem_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_mem_" + gpuIdx)
                if data["device_core_clock_" + gpuIdx] is None:
                    clock_core = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][1]
                    try:
                        data["device_core_clock_" + gpuIdx] = int(clock_core)
                        self.debug(
                            'Using legacy core_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_core))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_core_clock_" + gpuIdx)
                if data["device_mem_clock_" + gpuIdx] is None:
                    clock_mem = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][2]
                    try:
                        data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                        self.debug(
                            'Using legacy mem_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_mem))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_clock_" + gpuIdx)

        return data
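The ECC loop above indexes error type, counter type, and memory location with bare integers; pynvml also exposes named constants for these enums. A hedged sketch of the same query using them (the argument order follows the snippet above; the dict layout mirrors its eccErrors structure):

import pynvml

ECC_LOCATIONS = {
    'L1_CACHE': pynvml.NVML_MEMORY_LOCATION_L1_CACHE,
    'L2_CACHE': pynvml.NVML_MEMORY_LOCATION_L2_CACHE,
    'DEVICE_MEMORY': pynvml.NVML_MEMORY_LOCATION_DEVICE_MEMORY,
    'REGISTER_FILE': pynvml.NVML_MEMORY_LOCATION_REGISTER_FILE,
    'TEXTURE_MEMORY': pynvml.NVML_MEMORY_LOCATION_TEXTURE_MEMORY,
}
ECC_COUNTERS = {
    'VOLATILE_ECC': pynvml.NVML_VOLATILE_ECC,
    'AGGREGATE_ECC': pynvml.NVML_AGGREGATE_ECC,
}
ECC_ERROR_TYPES = {
    'ERROR_TYPE_CORRECTED': pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED,
    'ERROR_TYPE_UNCORRECTED': pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
}

def ecc_errors(handle):
    """Nested dict of ECC counters keyed location -> counter -> error type."""
    return {
        loc_name: {
            cnt_name: {
                err_name: pynvml.nvmlDeviceGetMemoryErrorCounter(
                    handle, err, cnt, loc)
                for err_name, err in ECC_ERROR_TYPES.items()
            }
            for cnt_name, cnt in ECC_COUNTERS.items()
        }
        for loc_name, loc in ECC_LOCATIONS.items()
    }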
Beispiel #40
0
def gpu_temp(handle=None, deviceID=0, cmap='cool', **kwargs):
    # assumption: fall back to looking the handle up by deviceID if none is given
    if handle is None:
        handle = nvmlDeviceGetHandleByIndex(deviceID)
    # the second NVML argument is the sensor type, not a device index
    temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
    norm = min(max(temp - 30., 0.), 25.) / 25.
    return plt.get_cmap(cmap)(norm, bytes=True)[:3]
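A brief usage sketch for gpu_temp (assumes the function above is in scope together with matplotlib.pyplot as plt; the normalization maps roughly 30-55 °C onto the 'cool' colormap):

import matplotlib.pyplot as plt
from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU)

nvmlInit()
try:
    # returns an (R, G, B) tuple taken from the 'cool' colormap
    print('GPU 0 temperature colour:', gpu_temp(deviceID=0))
finally:
    nvmlShutdown()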