Example #1
 def get_devices(self):
     """ Return name of devices """
     self.initialize()
     if self.device_count == 0:
         names = list()
     elif IS_MACOS:
         names = [pynvx.cudaGetName(handle, ignore=True)
                  for handle in self.handles]
     else:
         names = [pynvml.nvmlDeviceGetName(handle).decode("utf-8")
                  for handle in self.handles]
     if self.logger:
         self.logger.debug("GPU Devices: %s", names)
     return names
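For reference, a minimal standalone sketch of the NVML calls a wrapper like this builds on; it assumes only that the pynvml package is installed:

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        # older nvidia-ml-py bindings return bytes, newer ones return str
        if isinstance(name, bytes):
            name = name.decode("utf-8")
        print(i, name)
finally:
    pynvml.nvmlShutdown()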
Example #2
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not in TCC mode this will be None.
                usedmem = (nv_process.usedGpuMemory // MB if
                           nv_process.usedGpuMemory else None)
                process['gpu_memory_usage'] = usedmem
                # process['gpu_memory_usage'] = ("%d MiB" % usedmem if usedmem is not None else usedmem)
                process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = "%d MiB" % (
                #     round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['cpu_memory_usage'] = (
                    round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute mode;
                # however, we display the process only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        process['cpu_percent'] = 0.0
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        process['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else 0,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else 0,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else 0,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else 0,
                'memory.total': memory.total // MB if memory else 0,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
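A hedged sketch of how a helper like get_gpu_info is typically driven; it assumes N is an alias for pynvml (as in gpustat) and that get_gpu_info and its module-level helpers (_decode, MB, GPUStatCollection) are defined as above:

import pynvml as N

N.nvmlInit()
try:
    gpu_stats = [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
                 for i in range(N.nvmlDeviceGetCount())]
finally:
    N.nvmlShutdown()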
Example #3
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys: count, driver_version and devices:
            count: Number of gpus found
            driver_version: The version of the system’s graphics driver
            devices: It's a list and every item is a namedtuple Device which has 10 fields, for exzample id, name and fan_speed etc. 
                     It should be noted that the Process field is also a namedtuple which has 11 fields.
    """

    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts mW
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        # process_info = [(item.pid, item.usedGpuMemory) for item in process_info]
        process_info = []
        for nv_proc in processes:
            # append a Process namedtuple to process_info
            pid = nv_proc.pid
            used_gpu_mem = nv_proc.usedGpuMemory
            ps = psutil.Process(pid=pid)
            ps.cpu_percent()  # prime the counter; the first call always returns 0.0
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=ps.memory_percent(),
                    status=ps.status(),
                    username=ps.username(),
                    num_threads=ps.num_threads(),
                    cpu_num=ps.cpu_num(),
                    cpu_percent=ps.cpu_percent(),
                    name=ps.name(),
                    cmdline=" ".join(ps.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=ps.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))

    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
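A possible usage sketch, assuming get_infos and its imports (pynvml, psutil, time, namedtuple) are available; note the function expects NVML to be initialized by the caller:

import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print("driver:", infos["driver_version"], "gpus:", infos["count"])
    for dev in infos["devices"]:
        print(dev.id, dev.name, dev.used // (1024 ** 2), "MiB used")
finally:
    pynvml.nvmlShutdown()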
Example #4
	def _get_data(self):
		data = {}

		if self.deviceCount:
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				name = pynvml.nvmlDeviceGetName(handle)
				brand = pynvml.nvmlDeviceGetBrand(handle)
				brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce', 'Titan']

				### Get data ###
				## Memory usage
				try:
					mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
				except Exception as e:
					self.debug(str(e))
					mem = None

				## ECC errors
				try:
					eccErrors = {}
					eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
					memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
					memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY']
					for memoryLocation in range(5):
						# create fresh dicts per location/counter type so the entries do not all alias each other
						_eccCounter = {}
						for eccCounter in range(2):
							_memError = {}
							for memError in range(2):
								_memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(handle, memError, eccCounter, memoryLocation)
							_eccCounter[eccCounterType[eccCounter]] = _memError
						eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
				except Exception as e:
					self.debug(str(e))
					eccErrors = None

				## Temperature
				try:
					temp = pynvml.nvmlDeviceGetTemperature(handle,pynvml.NVML_TEMPERATURE_GPU)
				except Exception as e:
					self.debug(str(e))
					temp = None

				## Fan
				try:
					fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
				except Exception as e:
					self.debug(str(e))
					fanspeed = None

				## GPU and Memory Utilization
				try:
					util = pynvml.nvmlDeviceGetUtilizationRates(handle)
					gpu_util = util.gpu
					mem_util = util.memory
				except Exception as e:
					self.debug(str(e))
					gpu_util = None
					mem_util = None

				## Encoder Utilization
				try:
					encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
					enc_util = encoder[0]
				except Exception as e:
					self.debug(str(e))
					enc_util = None

				## Decoder Utilization
				try:
					decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
					dec_util = decoder[0]
				except Exception as e:
					self.debug(str(e))
					dec_util = None

				## Clock frequencies
				try:
					clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
					clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
					clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
				except Exception as e:
					self.debug(str(e))
					clock_core = None
					clock_sm = None
					clock_mem = None

				### Packing data ###
				self.debug("Device", gpuIdx, ":", str(name))
				data["device_name_" + gpuIdx] = name

				self.debug("Brand:", str(brands[brand]))

				self.debug(str(name), "Temp      :", str(temp))
				data["device_temp_" + gpuIdx] = temp

				self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
				data["device_mem_total_" + gpuIdx] = mem.total

				self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
				data["device_mem_used_" + gpuIdx] = mem.used

				self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
				data["device_mem_free_" + gpuIdx] = mem.free

				self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
				data["device_load_gpu_" + gpuIdx] = gpu_util

				self.debug(str(name), "Load MEM  :", str(mem_util), '%')
				data["device_load_mem_" + gpuIdx] = mem_util

				self.debug(str(name), "Load ENC  :", str(enc_util), '%')
				data["device_load_enc_" + gpuIdx] = enc_util

				self.debug(str(name), "Load DEC  :", str(dec_util), '%')
				data["device_load_dec_" + gpuIdx] = dec_util

				self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
				data["device_core_clock_" + gpuIdx] = clock_core

				self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
				data["device_sm_clock_" + gpuIdx] = clock_sm

				self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
				data["device_mem_clock_" + gpuIdx] = clock_mem

				self.debug(str(name), "Fan speed :", str(fanspeed), '%')
				data["device_fanspeed_" + gpuIdx] = fanspeed

				self.debug(str(name), "ECC errors:", str(eccErrors))
				if eccErrors is not None:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
				else:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

		## Get unit (S-class Nvidia cards) data
		if self.unitCount:
			for i in range(self.unitCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlUnitGetHandleByIndex(i)

				try:
					fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
					fan_speed = fan.speed  # Fan speed (RPM)
					fan_state = fan.state  # Flag that indicates whether fan is working properly
				except Exception as e:
					self.debug(str(e))
					fan_speed = None
					fan_state = None

				try:
					psu = pynvml.nvmlUnitGetPsuInfo(handle)
					psu_current = psu.current  # PSU current (A)
					psu_power = psu.power  # PSU power draw (W)
					psu_state = psu.state  # The power supply state
					psu_voltage = psu.voltage  # PSU voltage (V)
				except Exception as e:
					self.debug(str(e))
					psu_current = None
					psu_power = None
					psu_state = None
					psu_voltage = None

				try:
					temp_intake = pynvml.nvmlUnitGetTemperature(handle,0)  # Temperature at intake in C
					temp_exhaust = pynvml.nvmlUnitGetTemperature(handle,1)  # Temperature at exhaust in C
					temp_board = pynvml.nvmlUnitGetTemperature(handle,2)  # Temperature on board in C
				except Exception as e:
					self.debug(str(e))
					temp_intake = None
					temp_exhaust = None
					temp_board = None

				self.debug('Unit fan speed:',str(fan_speed))
				data["unit_fan_speed_" + gpuIdx] = fan_speed

				self.debug('Unit fan state:',str(fan_state))
				data["unit_fan_state_" + gpuIdx] = fan_state

				self.debug('Unit PSU current:',str(psu_current))
				data["unit_psu_current_" + gpuIdx] = psu_current

				self.debug('Unit PSU power:', str(psu_power))
				data["unit_psu_power_" + gpuIdx] = psu_power

				self.debug('Unit PSU state:', str(psu_state))
				data["unit_psu_state_" + gpuIdx] = psu_state

				self.debug('Unit PSU voltage:', str(psu_voltage))
				data["unit_psu_voltage_" + gpuIdx] = psu_voltage

				self.debug('Unit temp intake:', str(temp_intake))
				data["unit_temp_intake_" + gpuIdx] = temp_intake

				self.debug('Unit temp exhaust:', str(temp_exhaust))
				data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

				self.debug('Unit temp board:', str(temp_board))
				data["unit_temp_board_" + gpuIdx] = temp_board

		## Get data via legacy mode
		if self.legacy:
			try:
				output, error = Popen(
					[
						"nvidia-settings",
						"-c", ":0",
						"-q", "GPUUtilization",
						"-q", "GPUCurrentClockFreqs",
						"-q", "GPUCoreTemp",
						"-q", "TotalDedicatedGPUMemory",
						"-q", "UsedDedicatedGPUMemory"
					],
					shell=False,
					stdout=PIPE,stderr=PIPE).communicate()
				output = repr(str(output))
				if len(output) < 800:
					raise Exception('Error in fetching data from nvidia-settings ' + output)
				self.debug(str(error), output)
			except Exception as e:
				self.error(str(e))
				self.error('Setting legacy mode to False')
				self.legacy = False
				return data
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				if data["device_temp_" + gpuIdx] is None:
					coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_temp_" + gpuIdx] = int(coreTemp)
						self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
					except Exception as e:
						self.debug(str(e), "skipping device_temp_" + gpuIdx)
				if data["device_mem_used_" + gpuIdx] is None:
					memUsed = findall(r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_mem_used_" + gpuIdx] = int(memUsed)
						self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
					except Exception as e:
						self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
				if data["device_load_gpu_" + gpuIdx] is None:
					gpu_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
					try:
						data["device_load_gpu_" + gpuIdx] = int(gpu_util)
						self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
					except Exception as e:
						self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
				if data["device_load_mem_" + gpuIdx] is None:
					mem_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
					try:
						data["device_load_mem_" + gpuIdx] = int(mem_util)
						self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
					except Exception as e:
						self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
				if data["device_core_clock_" + gpuIdx] is None:
					clock_core = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1]
					try:
						data["device_core_clock_" + gpuIdx] = int(clock_core)
						self.debug('Using legacy core_clock for GPU {0}: {1}'.format(gpuIdx, clock_core))
					except Exception as e:
						self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
				if data["device_mem_clock_" + gpuIdx] is None:
					clock_mem = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2]
					try:
						data["device_mem_clock_" + gpuIdx] = int(clock_mem)
						self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem))
					except Exception as e:
						self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

		return data
Example #5
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_last_used(index):
                last_useds = []
                if not os.path.exists('gpu_history.pkl'):
                    pickle.dump({}, open('gpu_history.pkl', 'wb'))
                with open('gpu_history.pkl', 'rb') as f:
                    history = pickle.load(f)
                    if platform.node() in history:
                        for user, last_used in history[
                                platform.node()][index].items():
                            # 1 day = 24 hours, 1 hour = 3600 seconds
                            used_before = (datetime.now() - last_used['last_used']).days * 24 + \
                                          (datetime.now() - last_used['last_used']).seconds / 3600
                            last_useds.append((user, used_before))
                        return last_useds
                    else:
                        return []

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            last_used = get_last_used(index)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
                'last_used': last_used,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
Example #6
 def getName(self):
     r"""Get obect name"""
     return pynvml.nvmlDeviceGetName(self.handle)
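Since nvmlDeviceGetName returns bytes under older nvidia-ml-py bindings and str under newer ones, callers of a thin wrapper like this may want to normalize the result (dev here is a hypothetical instance of the class above):

name = dev.getName()
if isinstance(name, bytes):  # older bindings return bytes
    name = name.decode("utf-8")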
Example #7
    def setup(self):
        class TimeOutException(Exception):
            pass

        def alarm_handler(signum, frame):
            raise TimeOutException()

        self.data["root"] = os.getcwd()
        program = os.getenv(env.PROGRAM) or util.get_program()
        if program:
            self.data["program"] = program
        else:
            self.data["program"] = '<python with no main file>'
            if wandb._get_python_type() != "python":
                if os.getenv(env.NOTEBOOK_NAME):
                    self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
                else:
                    meta = wandb.jupyter.notebook_metadata()
                    if meta.get("path"):
                        if "fileId=" in meta["path"]:
                            self.data[
                                "colab"] = "https://colab.research.google.com/drive/" + meta[
                                    "path"].split("fileId=")[1]
                            self.data["program"] = meta["name"]
                        else:
                            self.data["program"] = meta["path"]
                            self.data["root"] = meta["root"]

        if not os.getenv(env.DISABLE_CODE):
            logger.debug("code probe starting")
            in_jupyter = wandb._get_python_type() != "python"
            # windows doesn't support alarm() and jupyter could call this in a thread context
            if platform.system() == "Windows" or not hasattr(
                    signal, 'SIGALRM') or in_jupyter:
                logger.debug("non time limited probe of code")
                self._setup_code_git()
                self._setup_code_program()
            else:
                old_alarm = None
                try:
                    try:
                        old_alarm = signal.signal(signal.SIGALRM,
                                                  alarm_handler)
                        signal.alarm(25)
                        self._setup_code_git()
                        self._setup_code_program()
                    finally:
                        signal.alarm(0)
                except TimeOutException:
                    logger.debug("timeout waiting for setup_code")
                finally:
                    if old_alarm:
                        signal.signal(signal.SIGALRM, old_alarm)
            logger.debug("code probe done")

        self.data["startedAt"] = datetime.utcfromtimestamp(
            wandb.START_TIME).isoformat()
        try:
            username = getpass.getuser()
        except KeyError:
            # getuser() could raise KeyError in restricted environments like
            # chroot jails or docker containers.  Return user id in these cases.
            username = str(os.getuid())

        # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users.
        if self._api.settings().get('anonymous') != 'true':
            self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
            self.data["username"] = os.getenv(env.USERNAME, username)
            self.data["executable"] = sys.executable
        else:
            self.data.pop("email", None)
            self.data.pop("root", None)

        self.data["os"] = platform.platform(aliased=True)
        self.data["python"] = platform.python_version()

        if env.get_docker():
            self.data["docker"] = env.get_docker()
        try:
            pynvml.nvmlInit()
            self.data["gpu"] = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass
        # TODO: we should use the cuda library to collect this
        if os.path.exists("/usr/local/cuda/version.txt"):
            with open("/usr/local/cuda/version.txt") as f:
                self.data["cuda"] = f.read().split(" ")[-1].strip()
        self.data["args"] = sys.argv[1:]
        self.data["state"] = "running"
Example #8
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        gpus_in_use = 0
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
                gpus_in_use += 1 if util_rate.memory > 50.0 else 0
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                self.gauge('nvml.process.count', len(cps), d_tags)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags['puser'] = self.get_process_owner(ps.pid)
                    docker_name, docker_image = self.get_container_name(ps.pid)
                    p_tags['docker_image'] = docker_image
                    p_tags['docker_name'] = docker_name
                    p_tags = self._dict2list(p_tags)
                    self.log.debug(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        self.gauge('nvml.gpus_in_use_count', gpus_in_use)
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #9
 def init(self):
     
     self.util_history = []
     self.temp_history = []
     pynvml.nvmlInit()
     self.gpu_handles = []
     self.deviceCount = pynvml.nvmlDeviceGetCount()
     
     for i in range(self.deviceCount):
         self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
     
     self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
     self.cpu_prog_bars = []
     self.gpu_boxes = []
     self.gpu_prog_bars = []
     
     self.prev_idle = []
     self.prev_total = []
     self.idle = []
     self.total = []
     
     #---cpu_box---
     try:
         stat = open("/proc/stat")
         
         statlines = stat.read().splitlines()
         stat.close()
         
         self.corecount = -1
         
         for line in statlines:
             if (line[0:2] == "cp"):
                 self.corecount += 1
             else:
                 break
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
         self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
         
         self.prev_idle.append(0)
         self.prev_total.append(0)
         self.idle.append(0)
         self.total.append(0)
     
     #---gpu_boxes---
     for i in range(self.deviceCount):
         product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
         product_name = product_name.decode('utf-8')
         
         gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
         
         label = Gtk.Label(product_name)
         
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
         
         gpu_box.pack_start(label, True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0)
         
         self.gpu_boxes.append(gpu_box)
     
     #---proc---
     proc_liststore = Gtk.ListStore(int, str, int)
     
     self.tree = Gtk.TreeView(model=proc_liststore)
     
     renderer_pid = Gtk.CellRendererText()
     column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0)
     column_pid.set_resizable(True)
     self.tree.append_column(column_pid)
     
     renderer_path = Gtk.CellRendererText()
     column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
     column_path.set_resizable(True)
     column_path.set_fixed_width(250)
     self.tree.append_column(column_path)
     
     renderer_mem = Gtk.CellRendererText()
     column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
     column_mem.set_resizable(True)
     self.tree.append_column(column_mem)
Example #10
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except:
                msg = "cannot get information of network interface %s" % \
                      self.nif

        if self.doDiskIO:
            try:
                # measure a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except:
                msg = "cannot get information of disk usage "

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%"+"(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor). I continue")

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
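For clarity, the string-building loops above assemble a parameterized INSERT; a small illustration with hypothetical labels:

labelList = ['cpu', 'mem', 'swap']  # hypothetical; real labels come from self.labelList
sql = ("INSERT INTO %(table)s ("
       + ", ".join(labelList)
       + ") VALUES("
       + ", ".join("%%(%s)f" % label for label in labelList)
       + ");")
# -> INSERT INTO %(table)s (cpu, mem, swap) VALUES(%(cpu)f, %(mem)f, %(swap)f);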
Example #11
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
Example #12
    def __init__(self,
                 model: TransformerMT,
                 corpus: Corpus,
                 optimizer: torch.optim.Optimizer,
                 stats: Stats,
                 bleu: BLEU,
                 tgt_character_level: bool,
                 buffer_every_steps: int,
                 report_every_steps: int,
                 eval_every_steps: int,
                 num_of_steps: int,
                 eval_type: str,
                 processed_steps: int,
                 learning_rate_schedule: str,
                 update_decay: int,
                 batch_capacity: int,
                 max_save_models: int,
                 grad_norm_clip: float,
                 grad_norm_clip_type: float,
                 annotate: str,
                 device_idxs: List[int],  # requires "from typing import List"
                 gpu_memory_limit: float,
                 ):
        self.model = model
        self.corpus = corpus
        self.optimizer = optimizer
        self.stats = stats
        self.bleu = bleu
        self.tgt_character_level = tgt_character_level

        self.buffer_every_steps = buffer_every_steps
        self.report_every_steps = report_every_steps
        self.eval_every_steps = eval_every_steps
        self.num_of_steps = num_of_steps

        self.eval_type = eval_type
        self.processed_steps = processed_steps
        self.update_decay = update_decay
        self.batch_capacity = batch_capacity

        self.src_pad_idx = self.model.src_pad_idx
        self.tgt_eos_idx = self.model.tgt_eos_idx
        self.tgt_pad_idx = self.model.tgt_pad_idx

        self.max_save_models = max_save_models
        self.grad_norm_clip = grad_norm_clip if grad_norm_clip > 0.0 else None
        self.grad_norm_clip_type = grad_norm_clip_type

        self.annotate = annotate

        self.device_idxs = device_idxs
        self.num_of_devices = len(self.device_idxs)
        self.gpu_memory_limit = gpu_memory_limit

        self.best_acc = 0.0
        self.best_loss = float('inf')
        self.best_bleu = 0.0
        self.best_step = 0

        self.lr_schedule = eval(learning_rate_schedule)
        self.lr = 0.005
        self.backward_factor = list()

        self.loss_report = numpy.zeros(self.report_every_steps, dtype=float)
        self.acc_report = numpy.zeros(self.report_every_steps, dtype=float)
        self.update_decay_steps = numpy.zeros(self.report_every_steps, dtype=int)
        self.src_tokens = numpy.zeros(self.report_every_steps, dtype=int)
        self.tgt_tokens = numpy.zeros(self.report_every_steps, dtype=int)
        self.src_num_pad_tokens = numpy.zeros(self.report_every_steps, dtype=int)
        self.tgt_num_pad_tokens = numpy.zeros(self.report_every_steps, dtype=int)
        self.num_examples = numpy.zeros(self.report_every_steps, dtype=int)
        self.time_sum = 0.0

        self.memory_unit = float(2 ** 30)

        # for uncertainty estimation
        self.esti_variance_every_steps = 1000
        self.tolerance = 4

        nvmlInit()
        print('Driver version: %s' % nvmlSystemGetDriverVersion().decode('utf-8'))

        device_true_idxs = list(int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(','))
        self.gpu_info_handler = list(nvmlDeviceGetHandleByIndex(x) for x in device_true_idxs)
        for idx, handler in enumerate(self.gpu_info_handler):
            print('Device no.%d, true idx: %d' % (idx, device_true_idxs[idx]))
            print('\tGPU Name: %s' % nvmlDeviceGetName(handler).decode('utf-8'))

        self.queue = Queue(maxsize=self.num_of_devices)
        self.replicas = list()

        self.async_update_rules = list()
        device_idxs_rules = self.device_idxs.copy()
        while len(device_idxs_rules) > 1:
            rules = dict()
            for i in range(1, len(device_idxs_rules), 2):
                rules[device_idxs_rules[i]] = device_idxs_rules[i - 1]
            device_idxs_rules = device_idxs_rules[::2]
            self.async_update_rules.append(rules)

        return
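The pairing loop at the end builds a pairwise reduction tree over device indices; a quick trace with hypothetical indices:

# device_idxs = [0, 1, 2, 3] yields async_update_rules = [{1: 0, 3: 2}, {2: 0}]:
# step 1: device 1 merges into device 0, device 3 merges into device 2
# step 2: device 2 merges into device 0, leaving device 0 with the combined update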
Example #13
 def detect_devices(self) -> None:
     self.devices_count = nvmlDeviceGetCount()
     for i in range(self.devices_count):
         handle = nvmlDeviceGetHandleByIndex(i)
         device_name = nvmlDeviceGetName(handle).decode("UTF-8")
         self.logger.info(f"Device nr. {i}: '{device_name}'")
Example #14
show_str_tot_lst = []

drv_ver = pml.nvmlSystemGetDriverVersion()
show_str_tot_lst.append('Driver Version: ' + bytes.decode(drv_ver))
show_str_tot_lst.append('{:<4}{:12}{:<13}{:6}{:6}{:8}{:12}{:8}{:<10}'.format(
    'id', 'type', 'video memory', 'temp.', 'util.', 'pid', 'process', 'users',
    'MemUsed'))

for i in range(deviceCount):
    handle = pml.nvmlDeviceGetHandleByIndex(i)
    show_str_lst = []
    show_str_lst.append(str(i) + '  ')

    # Get the full name of the graphics card
    card_name = pml.nvmlDeviceGetName(handle)
    card_name = bytes.decode(card_name)
    card_name = ''.join(card_name.split(' ')[1:])
    show_str_lst.append(card_name)

    # Memory usage
    mem_info = pml.nvmlDeviceGetMemoryInfo(handle)
    mem_total = '{:6}'.format(mem_info.total // mega) + 'M'
    mem_free = '{:6}'.format(mem_info.free // mega) + 'M'
    mem_used = '{:<6}'.format(str(mem_info.used // mega) + 'M')
    show_str_lst.append('   ' + mem_used + '/' + mem_total)

    # Temperature
    card_temp = ' ' + str(
        pml.nvmlDeviceGetTemperature(handle, pml.NVML_TEMPERATURE_GPU)) + 'C'
    show_str_lst.append(card_temp)
Example #15
# Python 2:
# pip install nvidia-ml-py2
# Python 3:
# pip install nvidia-ml-py3

import pynvml

pynvml.nvmlInit()
print('Driver info: ')
print("Driver: ", pynvml.nvmlSystemGetDriverVersion())
print('--------------')
print('Device info: ')
deviceCount = pynvml.nvmlDeviceGetCount()
print('  Found %s GPU(s), named:' % deviceCount)
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("    GPU", i, ":", pynvml.nvmlDeviceGetName(handle))
print('--------------')
for i in range(deviceCount):
    print('Memory, temperature, fan and power of GPU %s: ' % i)
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("Memory Total: %0.2f G" % (info.total/1024/1024/1024))  # total memory
    print("Memory Free: %0.2f G" % (info.free/1024/1024/1024))    # free memory
    print("Memory Used: %0.2f G" % (info.used/1024/1024/1024))
    print("Memory Used percent: %0.2f %%" % (info.used/info.total*100))
    print("Temperature is %d C" % pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU))
    print("Fan speed is ", pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power state", pynvml.nvmlDeviceGetPowerState(handle))
    print('--------------')

# Finally, shut down NVML
pynvml.nvmlShutdown()
Example #16
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not TCC this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
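get_gpu_info above is an inner helper of a larger query routine; a minimal sketch of the outer loop that could drive it (the N alias and MB constant mirror the snippet's own assumptions, and the GPUStatCollection bookkeeping it references is omitted):

import pynvml as N

MB = 1024 * 1024
N.nvmlInit()
gpus = []
for i in range(N.nvmlDeviceGetCount()):
    handle = N.nvmlDeviceGetHandleByIndex(i)
    gpus.append(get_gpu_info(handle))  # the helper defined above
N.nvmlShutdown()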
Exemple #17
0
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlSystemGetDriverVersion,
    nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetName,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetUtilizationRates,
)

nvmlInit()
print("Driver Version: %s" % nvmlSystemGetDriverVersion())

deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    print("Device %s: %s" % (i, nvmlDeviceGetName(handle)))

    memory_info = nvmlDeviceGetMemoryInfo(handle)
    print("Device %s: Total memory: %s" % (i, memory_info.total / 1024 / 1024))
    print("Device %s: Free memory: %s" % (i, memory_info.free / 1024 / 1024))
    print("Device %s: Used memory: %s" % (i, memory_info.used / 1024 / 1024))

    util = nvmlDeviceGetUtilizationRates(handle)
    print("Device %s: GPU Utilization: %s%%" % (i, util.gpu))
    print("Device %s: Memory Utilization: %s%%" % (i, util.memory))

nvmlShutdown()
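Any of the queries above can raise NVMLError on unsupported hardware, which would skip nvmlShutdown(). A try/finally variant of the same walk, as a sketch using the same imports, guarantees the NVML context is released:

nvmlInit()
try:
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        print("Device %s: %s" % (i, nvmlDeviceGetName(handle)))
finally:
    nvmlShutdown()  # release the NVML context even if a query raised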
Exemple #18
0
def _get_gpu_type(gpu_device):
    return nvmlDeviceGetName(gpu_device)
Exemple #19
0
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		elif self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						continue
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 80))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				# parse_qs maps each field name to a list of values
				host_param = parameters.get('host' + str(index), [''])[0]
				start_param = parameters.get('start' + str(index), [''])[0]
				end_param = parameters.get('end' + str(index), [''])[0]
				if not host_param.strip():
					pass
				elif not start_param.strip():
					pass
				elif not end_param.strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', start_param.strip())
					cmd_txt = cmd_txt.replace('#2#', end_param.strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Exemple #20
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
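The check above is Python 2 code (xrange, long). A small compatibility shim at the top of the file, shown here as a sketch, would let the same body run under Python 3:

try:
    xrange, long
except NameError:  # Python 3: provide the Python 2 builtins the check uses
    xrange = range
    long = int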
Exemple #21
0
def get_gpu_name(handle):
    """Returns the name of the GPU device
    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481
    """
    name = pynvml.nvmlDeviceGetName(handle)
    return to_utf8(name)
Exemple #22
0
def one_time():
    h = _pynvml_handles()
    return {
        "memory-total": pynvml.nvmlDeviceGetMemoryInfo(h).total,
        "name": pynvml.nvmlDeviceGetName(h).decode(),
    }
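_pynvml_handles is not shown in this excerpt; a plausible stand-in (hypothetical — the real helper may cache the handle or honour CUDA_VISIBLE_DEVICES) would be:

import pynvml

def _pynvml_handles():
    """Hypothetical stand-in: initialise NVML and return a handle to GPU 0."""
    pynvml.nvmlInit()
    return pynvml.nvmlDeviceGetHandleByIndex(0)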
Exemple #23
0
 def get_devices(self):
     """ Return name of devices """
     names = [pynvml.nvmlDeviceGetName(handle).decode("utf-8")
              for handle in self.handles]
     return names
Exemple #24
0
if no_gather_nvml_gpu_info:
    logging.debug("--no-gather-nvml-gpu-info passed, " +
                  "using blank values for source database GPU info fields " +
                  "[gpu_driver_ver, run_gpu_name] ")
elif conn_machine_name == "localhost" or gather_nvml_gpu_info:
    logging.debug("Gathering source database GPU info fields " +
                  "[gpu_driver_ver, run_gpu_name] " +
                  "from local GPU using pynvml. ")
    import pynvml

    pynvml.nvmlInit()
    source_db_gpu_driver_ver = pynvml.nvmlSystemGetDriverVersion().decode()
    for i in range(source_db_gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        # Assume all cards are the same, overwrite name value
        source_db_gpu_name = pynvml.nvmlDeviceGetName(handle).decode()
    pynvml.nvmlShutdown()
# If gpu_count argument passed in, override gathered value
if gpu_count:
    source_db_gpu_count = gpu_count
# Set machine names, using local info if connected to localhost
if conn_machine_name == "localhost":
    local_uname = os.uname()
if machine_name:
    run_machine_name = machine_name
else:
    if conn_machine_name == "localhost":
        run_machine_name = local_uname.nodename.split(".")[0]
    else:
        run_machine_name = conn_machine_name
if machine_uname:
Exemple #25
0
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)

if device_name != b'Tesla T4':
  raise Exception("""
    Unfortunately Colab didn't give you a T4 GPU.
    
    Make sure you've configured Colab to request a GPU instance type.
    
    If you get a K80 GPU, try Runtime -> Reset all runtimes...
  """)
else:
  print('Woo! You got the right kind of GPU!')
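Older nvidia-ml-py builds return bytes from nvmlDeviceGetName while newer ones return str, so the b'Tesla T4' comparison above is brittle. A version-agnostic variant of the same check, as a sketch:

name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):  # older bindings return bytes, newer return str
    name = name.decode('utf-8')
if name != 'Tesla T4':
    raise Exception('Expected a Tesla T4, got %r' % name)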
Exemple #26
0
def get_device_name(device_handle):
    """Get GPU device name"""
    try:
        return pynvml.nvmlDeviceGetName(device_handle)
    except pynvml.NVMLError:
        return "NVIDIA"
Exemple #27
0
def _get_name(h):
    try:
        return pynvml.nvmlDeviceGetName(h).decode()
    except pynvml.NVMLError_NotSupported:
        return None
Exemple #28
0
            sysdata['ssd0_exist'] = False

        if os.path.isdir('/ssd1'):
            ssd1_usage = psutil.disk_usage('/ssd1')
            sysdata['ssd1_exist'] = True
            sysdata['ssd1_used'] = toGB(ssd1_usage.used)
            sysdata['ssd1_total'] = toGB(ssd1_usage.total)
        else:
            sysdata['ssd1_exist'] = False

        procs = deviceCount * [None]
        gpu_error = deviceCount * [False]
        for i in range(deviceCount):
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                name = nvmlDeviceGetName(handle)
                gpudata[i]['name'] = name.decode('utf-8')

                memInfo = nvmlDeviceGetMemoryInfo(handle)
                gpudata[i]['mem_free'] = toMB(memInfo.total - memInfo.used)
                gpudata[i]['mem_total'] = toMB(memInfo.total)
                gpudata[i]['mem_usage'] = memInfo.used / memInfo.total * 100

                procs_prefilter = nvmlDeviceGetComputeRunningProcesses(handle)
                # for unknown reasons, nvmlDeviceGetComputeRunningProcesses
                # sometimes returns nonexistent processes on 3090 GPUs
                procs[i] = []
                gpudata[i]['procs'] = []
                for p in procs_prefilter:
                    try:
                        P = psutil.Process(p.pid)
Exemple #29
0
    def setup(self):
        self.data["root"] = os.getcwd()
        try:
            import __main__
            self.data["program"] = __main__.__file__
        except (ImportError, AttributeError):
            self.data["program"] = '<python with no main file>'
            if wandb._get_python_type() != "python":
                if os.getenv(env.NOTEBOOK_NAME):
                    self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
                else:
                    meta = wandb.jupyter.notebook_metadata()
                    if meta.get("path"):
                        if "fileId=" in meta["path"]:
                            self.data[
                                "colab"] = "https://colab.research.google.com/drive/" + meta[
                                    "path"].split("fileId=")[1]
                            self.data["program"] = meta["name"]
                        else:
                            self.data["program"] = meta["path"]
                            self.data["root"] = meta["root"]

        program = os.path.join(self.data["root"], self.data["program"])
        if not os.getenv(env.DISABLE_CODE):
            if self._api.git.enabled:
                self.data["git"] = {
                    "remote": self._api.git.remote_url,
                    "commit": self._api.git.last_commit
                }
                self.data["email"] = self._api.git.email
                self.data["root"] = self._api.git.root or self.data["root"]

            if os.path.exists(program) and self._api.git.is_untracked(
                    self.data["program"]):
                util.mkdir_exists_ok(
                    os.path.join(self.out_dir, "code",
                                 os.path.dirname(self.data["program"])))
                saved_program = os.path.join(self.out_dir, "code",
                                             self.data["program"])
                if not os.path.exists(saved_program):
                    self.data["codeSaved"] = True
                    copyfile(program, saved_program)

        self.data["startedAt"] = datetime.utcfromtimestamp(
            wandb.START_TIME).isoformat()
        self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
        try:
            username = getpass.getuser()
        except KeyError:
            # getuser() could raise KeyError in restricted environments like
            # chroot jails or docker containers.  Return user id in these cases.
            username = str(os.getuid())
        self.data["username"] = os.getenv(env.USERNAME, username)
        self.data["os"] = platform.platform(aliased=True)
        self.data["python"] = platform.python_version()
        self.data["executable"] = sys.executable
        if env.get_docker():
            self.data["docker"] = env.get_docker()
        try:
            pynvml.nvmlInit()
            self.data["gpu"] = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass
        # TODO: we should use the cuda library to collect this
        if os.path.exists("/usr/local/cuda/version.txt"):
            self.data["cuda"] = open(
                "/usr/local/cuda/version.txt").read().split(" ")[-1].strip()
        self.data["args"] = sys.argv[1:]
        self.data["state"] = "running"
Exemple #30
0
import pynvml as nv
import time
import os
import sys
#import psutil

gpu_id = 0
query_interval = 0.5
nv.nvmlInit()
handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
print "Driver Version: ", nv.nvmlSystemGetDriverVersion()
print "GPU", gpu_id, "Device Name: ", nv.nvmlDeviceGetName(handle)
while (1):
    try:
        memory = nv.nvmlDeviceGetMemoryInfo(handle)
        device_util = nv.nvmlDeviceGetUtilizationRates(handle)
        print "Memory total:", memory.total / 1024 / 1024, "M.   ", "Memory used:", memory.used / 1024 / 1024, "M."
        print "Memory-util: %.2f" % (
            memory.used * 100.0 /
            memory.total), "%.     ", "GPU-util:", device_util.gpu, "%."
        time.sleep(query_interval)
    except IndexError, e:
        nv.nvmlShutdown()
        print "process  terminal!"
        sys.exit()
'''
while(1):
    try:
        pid_obj = os.popen('pgrep -f matrix_apps_config_8801')
        pid = int(pid_obj.read().split()[0])
        print pid
Exemple #31
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes (None when the driver does not report usage)
                usedmem = nv_process.usedGpuMemory
                process['gpu_memory_usage'] = (int(usedmem / 1024 / 1024)
                                               if usedmem else None)
                process['pid'] = nv_process.pid
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None  # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': int(power / 1000) if power is not None else None,
                'enforced.power.limit':
                    int(power_limit / 1000) if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': int(memory.used / 1024 / 1024) if memory else None,
                'memory.total': int(memory.total / 1024 / 1024) if memory else None,
                'processes': processes,
            }
            return gpu_info
Exemple #32
0
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except:
                msg = "cannot get information of network interface %s" % \
                      self.nif

        if self.doDiskIO:
            try:
                # measure a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except:
                msg = "cannot get information of disk usage "

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%" + "(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor). I continue")

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
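The column and placeholder assembly in step() above, with its trailing-comma trimming, can be written in one pass with str.join. A behaviour-equivalent sketch (columns and placeholders are illustrative local names):

columns = ", ".join(self.labelList)
placeholders = ", ".join("%%(%s)f" % label for label in self.labelList)
sqlCommand = "INSERT INTO %%(table)s (%s) VALUES(%s);" % (columns, placeholders)
sql = sqlCommand % valuesDict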
Exemple #33
0
def get_device_name(device_handle):
    """Get GPU device name."""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
Exemple #34
0
    def get(self):
        """Write the web page content."""
        global cpu_load
        global gpu_load_compute
        global gpu_load_memory
        memory = psutil.virtual_memory()
        swap = psutil.swap_memory()
        if nvidia:
            nvmlHandle = nvmlDeviceGetHandleByIndex(0)
            gpu = nvmlDeviceGetName(nvmlHandle).decode('utf-8')
            gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle)
            gpu_ram = round(gpu_memory.total / (1024 * 1048576), 2)
            gpu += " - " + str(gpu_ram) + "GB"
        else:
            gpu = "Not recognized"
        ram = str(int(round(float(memory.total) / (1024 * 1048576)))) + "GB"
        ram += " (swap: " + str(int(round(float(swap.total) / (1024 * 1048576)))) + "GB)"
        real_cores = psutil.cpu_count(False)
        cores_ratio = int(psutil.cpu_count(True) / real_cores)
        cores = " (" + str(cores_ratio) + "x " + str(real_cores) + " cores)"
        if sys.platform.startswith('linux'):
            distribution = distro.linux_distribution()
            os_name = 'Linux ' + distribution[0] + " " + distribution[1] + " " + distribution[2]
            command = "cat /proc/cpuinfo"
            all_info = subprocess.check_output(command, shell=True).decode('utf-8').strip()
            for line in all_info.split("\n"):
                if "model name" in line:
                    cpu = re.sub(".*model name.*:", "", line, 1)
                    break
        elif sys.platform == 'win32':
            computer = wmi.WMI()
            os_info = computer.Win32_OperatingSystem()[0]
            cpu = computer.Win32_Processor()[0].Name
            os_name = os_info.Name.split('|')[0] + ", version " + os_info.Version
        elif sys.platform == 'darwin':
            os_name = 'macOS ' + platform.mac_ver()[0]
            os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
            command = 'sysctl -n machdep.cpu.brand_string'
            cpu = subprocess.check_output(command, shell=True).decode('utf-8').strip()
        else:  # unknown platform
            os_name = 'Unknown'
            cpu = 'Unknown'
        self.write("<!DOCTYPE html>\n")
        self.write("<html><head><meta charset='utf-8'/><title>Webots simulation server</title>")
        self.write("<link rel='stylesheet' type='text/css' href='css/monitor.css'></head>\n")
        self.write("<body><h1>Webots simulation server: " + socket.getfqdn() + "</h1>")
        self.write("<h2>Host: " + os_name + "</h2>\n")
        self.write("<p><b>CPU load: %g%%</b><br>\n" % cpu_load)
        self.write(cpu + cores + "</p>\n")
        self.write("<p><b>GPU load compute: %g%% &mdash; load memory: %g%%</b><br>\n" %
                   (gpu_load_compute, gpu_load_memory))
        self.write(gpu + "</p>\n")
        self.write("<p><b>RAM:</b><br>" + ram + "</p>\n")
        self.write("<canvas id='graph' height='400' width='1024'></canvas>\n")
        self.write("<script src='https://www.cyberbotics.com/harry-plotter/0.9f/harry.min.js'></script>\n")
        self.write("<script>\n")
        self.write("window.onload = function() {\n")

        def appendData(label):
            global snapshots
            d = "{title:'" + label + "',values:["
            for s in snapshots:
                d += str(s.data[label]) + ','
            return d[:-1] + "]},"

        datas = ''
        datas += appendData('Webots running')
        datas += appendData('Webots idle')
        datas += appendData('CPU load')
        datas += appendData('CPU memory')
        datas += appendData('GPU load compute')
        datas += appendData('GPU load memory')
        datas += appendData('GPU memory')
        datas += appendData('Swap')
        datas += appendData('Disk')
        datas += appendData('Network sent')
        datas += appendData('Network received')

        datas = datas[:-1]  # remove the last comma
        self.write("  plotter({\n")
        self.write("    canvas: 'graph',\n")
        self.write("    datas:[ " + datas + "],\n")
        self.write("""
     labels:{
        ypos:"left",
        x:100,
        y:[50,100],
        marks:2
     },
     fill:"none",
     opacity:0.5,
     linewidth:3,
     background:"#fff",
     autoscale:"top",
     grid:{
        x:[0,100]
     },
     mouseover:{
        radius:4,
        linewidth:2,
        bullet:"#444",
        shadowbox:"1,1,0,#000",
        axis:"x"
     }
  });""")
        self.write("}\n")
        self.write("</script>\n")
        self.write("</body></html>")
Exemple #35
0
 def getName(self):
     return pynvml.nvmlDeviceGetName(self.handle)
Exemple #36
0
def device_name_for(device_handle):
    """Get GPU device name"""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
Exemple #37
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                if process['username'] == 'root':
                    out = subprocess.check_output(
                        'docker inspect --format "{{.Name}}" "$(cat /proc/' +
                        str(process['pid']) +
                        '/cgroup |head -n 1 |cut -d / -f 3)" | sed "s/^\///"',
                        shell=True).decode('utf-8').strip()
                    if "Error" not in out:
                        process['username'] = out
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
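The root-username fix-up above shells out to docker inspect through a fragile pipeline. A hedged alternative reads the container id straight from /proc/<pid>/cgroup in Python (the cgroup path layout is an assumption that varies by container runtime):

def container_id_for(pid):
    """Best-effort container id from /proc/<pid>/cgroup (None on the host)."""
    try:
        with open('/proc/%d/cgroup' % pid) as f:
            first_line = f.readline().strip()
    except OSError:
        return None
    # e.g. '12:pids:/docker/<64-hex-id>' -> '<64-hex-id>'
    tail = first_line.rsplit('/', 1)[-1]
    return tail or None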
Exemple #38
0
 def get_device_name(self, device_handle):
     """Get GPU device name"""
     try:
         return pynvml.nvmlDeviceGetName(device_handle)
     except pynvml.NVMLError:
         return "NVIDIA"