Example #1
def cb():
    src_dict = {}
    # nvmlDeviceGetPcieThroughput reports KB/s; divide by 1024 for MB/s
    src_dict["pci-tx"] = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) / 1024 for i in range(ngpus)]
    src_dict["pci-rx"] = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) / 1024 for i in range(ngpus)]
    source.data.update(src_dict)
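Most examples on this page reference gpu_handles, ngpus, and a Bokeh ColumnDataSource named source without defining them. A minimal sketch of the assumed setup (the variable names mirror the snippets; only the pynvml calls are library API):

import pynvml

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]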
Example #2
    def cb():
        nonlocal last_time
        now = time.time()
        # Bokeh expects timestamps in milliseconds
        src_dict = {"time": [now * 1000]}
        gpu_tot = 0
        mem_tot = 0
        tx_tot = 0
        rx_tot = 0
        for i in range(ngpus):
            gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
            mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
            # nvmlDeviceGetPcieThroughput reports KB/s; multiply by 1024 for B/s
            tx = (pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024)
            rx = (pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024)
            gpu_tot += gpu
            mem_tot += mem / (1024 * 1024)  # accumulate the total in MiB
            rx_tot += rx
            tx_tot += tx
            src_dict["gpu-" + str(i)] = [gpu]
            src_dict["memory-" + str(i)] = [mem]
        src_dict["gpu-total"] = [gpu_tot / ngpus]
        src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100]
        src_dict["tx-total"] = [tx_tot]
        src_dict["rx-total"] = [rx_tot]

        # Append one row per tick, keeping at most the last 1000 points
        source.stream(src_dict, 1000)

        last_time = now
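Because cb() streams rows, the source it feeds must be created up front with every column the callback appends, and gpu_mem_sum must hold the total memory in the same MiB units. A sketch of that initialization (the names are assumptions that mirror the callback):

from bokeh.models import ColumnDataSource

# Total memory across all GPUs, in MiB, to match mem_tot above
gpu_mem_sum = sum(
    pynvml.nvmlDeviceGetMemoryInfo(h).total for h in gpu_handles
) / (1024 * 1024)

cols = {"time": [], "gpu-total": [], "memory-total": [],
        "tx-total": [], "rx-total": []}
for i in range(ngpus):
    cols["gpu-" + str(i)] = []
    cols["memory-" + str(i)] = []
source = ColumnDataSource(cols)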
Example #3
def test_nvmlDeviceGetPcieThroughput(ngpus, handles):
    for i in range(ngpus):
        # Throughput counters are reported in KB/s and should never be negative
        tx_bytes_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES)
        assert tx_bytes_tp >= 0
        rx_bytes_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES)
        assert rx_bytes_tp >= 0
        count_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_COUNT)
        assert count_tp >= 0
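The ngpus and handles arguments look like pytest fixtures. A plausible sketch of such fixtures (the fixture bodies are an assumption; only the pynvml calls are real API):

import pytest
import pynvml

@pytest.fixture(scope="module")
def ngpus():
    pynvml.nvmlInit()
    yield pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()

@pytest.fixture
def handles(ngpus):
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]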
Example #4
def pci(doc):

    tx_fig = figure(title="TX Bytes [MB/s]",
                    sizing_mode="stretch_both", y_range=[0, 5000])
    # nvmlDeviceGetPcieThroughput reports KB/s; divide by 1024 for MB/s
    pci_tx = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES)/1024 for i in range(ngpus)]
    left = list(range(len(pci_tx)))
    right = [l + 0.8 for l in left]
    # Use one source per figure; reusing a single name here would leave the
    # TX figure bound to a source the callback never updates.
    tx_source = ColumnDataSource({"left": left, "right": right, "pci-tx": pci_tx})
    mapper = LinearColorMapper(
        palette=all_palettes['RdYlBu'][4], low=0, high=5000)

    tx_fig.quad(
        source=tx_source, left="left", right="right", bottom=0, top="pci-tx",
        color={"field": "pci-tx", "transform": mapper}
    )

    rx_fig = figure(title="RX Bytes [MB/s]",
                    sizing_mode="stretch_both", y_range=[0, 5000])
    pci_rx = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES)/1024 for i in range(ngpus)]
    rx_source = ColumnDataSource({"left": left, "right": right, "pci-rx": pci_rx})

    rx_fig.quad(
        source=rx_source, left="left", right="right", bottom=0, top="pci-rx",
        color={"field": "pci-rx", "transform": mapper}
    )

    doc.title = "PCI Throughput"
    doc.add_root(
        column(tx_fig, rx_fig, sizing_mode="stretch_both")
    )

    def cb():
        tx_source.data.update({"pci-tx": [pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES)/1024 for i in range(ngpus)]})
        rx_source.data.update({"pci-rx": [pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES)/1024 for i in range(ngpus)]})

    doc.add_periodic_callback(cb, 200)
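pci takes a Bokeh Document, so it can be served with a FunctionHandler. A minimal harness sketch, assuming the gpu_handles/ngpus setup shown under Example #1 (the route is arbitrary):

from bokeh.application import Application
from bokeh.application.handlers.function import FunctionHandler
from bokeh.server.server import Server

# Serve the pci() app on the default port (5006)
server = Server({"/": Application(FunctionHandler(pci))})
server.start()
server.io_loop.start()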
Example #5
def pci(doc):

    # Use device-0 to get "upper bound"
    pci_gen = pynvml.nvmlDeviceGetMaxPcieLinkGeneration(gpu_handles[0])
    pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
    pci_bw = {
        # Keys = PCIe generation, Values = max per-lane BW in GB/s (per direction)
        # [Note: Using specs at https://en.wikipedia.org/wiki/PCI_Express]
        1: (250.0 / 1024.0),
        2: (500.0 / 1024.0),
        3: (985.0 / 1024.0),
        4: (1969.0 / 1024.0),
        5: (3938.0 / 1024.0),
        6: (7877.0 / 1024.0),
    }
    # Max PCIe throughput = per-lane BW * lane width
    max_rxtx_tp = pci_width * pci_bw[pci_gen]

    pci_tx = [
        pynvml.nvmlDeviceGetPcieThroughput(gpu_handles[i],
                                           pynvml.NVML_PCIE_UTIL_TX_BYTES) /
        (1024.0 * 1024.0)  # Convert KB/s -> GB/s
        for i in range(ngpus)
    ]

    pci_rx = [
        pynvml.nvmlDeviceGetPcieThroughput(gpu_handles[i],
                                           pynvml.NVML_PCIE_UTIL_RX_BYTES) /
        (1024.0 * 1024.0)  # Convert KB/s -> GB/s
        for i in range(ngpus)
    ]

    left = list(range(ngpus))
    right = [l + 0.8 for l in left]
    source = ColumnDataSource({
        "left": left,
        "right": right,
        "pci-tx": pci_tx,
        "pci-rx": pci_rx
    })
    mapper = LinearColorMapper(palette=all_palettes["RdYlBu"][4],
                               low=0,
                               high=max_rxtx_tp)

    tx_fig = figure(title="TX Bytes [GB/s]",
                    sizing_mode="stretch_both",
                    y_range=[0, max_rxtx_tp])
    tx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="pci-tx",
        color={
            "field": "pci-tx",
            "transform": mapper
        },
    )
    tx_fig.toolbar_location = None

    rx_fig = figure(title="RX Bytes [GB/s]",
                    sizing_mode="stretch_both",
                    y_range=[0, max_rxtx_tp])
    rx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="pci-rx",
        color={
            "field": "pci-rx",
            "transform": mapper
        },
    )
    rx_fig.toolbar_location = None

    doc.title = "PCI Throughput"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    def cb():
        src_dict = {}
        src_dict["pci-tx"] = [
            pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) /
            (1024.0 * 1024.0)  # Convert KB/s -> GB/s
            for i in range(ngpus)
        ]
        src_dict["pci-rx"] = [
            pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) /
            (1024.0 * 1024.0)  # Convert KB/s -> GB/s
            for i in range(ngpus)
        ]
        source.data.update(src_dict)

    doc.add_periodic_callback(cb, 200)
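As a quick sanity check of the bandwidth table, a Gen3 x16 link comes out to roughly 15.4 GB/s per direction:

# 985 MB/s per Gen3 lane, 16 lanes, 1024 MB per GB
max_gen3_x16 = 16 * (985.0 / 1024.0)
print(round(max_gen3_x16, 1))  # -> 15.4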
Example #6
	def _get_data(self):
		data = {}

		if self.deviceCount:
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				name = pynvml.nvmlDeviceGetName(handle)
				brand = pynvml.nvmlDeviceGetBrand(handle)
				brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce', 'Titan']

				### Get data ###
				## Memory usage
				try:
					mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
				except Exception as e:
					self.debug(str(e))
					mem = None

				## ECC errors
				try:
					eccErrors = {}
					eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
					memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
					memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY']
					for memoryLocation in range(5):
						# Create fresh dicts at each level; reusing one dict for
						# every location/counter would leave all entries aliased
						# to the same object
						_eccCounter = {}
						for eccCounter in range(2):
							_memError = {}
							for memError in range(2):
								_memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(handle, memError, eccCounter, memoryLocation)
							_eccCounter[eccCounterType[eccCounter]] = _memError
						eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
				except Exception as e:
					self.debug(str(e))
					eccErrors = None

				## Temperature
				try:
					temp = pynvml.nvmlDeviceGetTemperature(handle,pynvml.NVML_TEMPERATURE_GPU)
				except Exception as e:
					self.debug(str(e))
					temp = None

				## Fan
				try:
					fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
				except Exception as e:
					self.debug(str(e))
					fanspeed = None

				## Power
				try:
					power = pynvml.nvmlDeviceGetPowerUsage(handle)
				except Exception as e:
					self.debug(str(e))
					power = None

				## GPU and Memory Utilization
				try:
					util = pynvml.nvmlDeviceGetUtilizationRates(handle)
					gpu_util = util.gpu
					mem_util = util.memory
				except Exception as e:
					self.debug(str(e))
					gpu_util = None
					mem_util = None

				## PCI Express Bandwidth Utilization
				try: 
					pcie_tx = pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_TX_BYTES)
					pcie_rx = pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_RX_BYTES)
				except Exception as e:
					self.debug(str(e))
					pcie_tx = None
					pcie_rx = None

				### Packing data ###
				self.debug("Device", gpuIdx, ":", str(name))
				data["device_name_" + gpuIdx] = name

				self.debug("Brand:", str(brands[brand]))

				self.debug(str(name), "Temp      :", str(temp))
				data["device_temp_" + gpuIdx] = temp

				self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
				data["device_mem_total_" + gpuIdx] = mem.total

				self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
				data["device_mem_used_" + gpuIdx] = mem.used

				self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
				data["device_mem_free_" + gpuIdx] = mem.free

				self.debug(str(name), "Utilization GPU  :", str(gpu_util), '%')
				data["device_util_gpu_" + gpuIdx] = gpu_util

				self.debug(str(name), "Utilization MEM  :", str(mem_util), '%')
				data["device_util_mem_" + gpuIdx] = mem_util

				self.debug(str(name), "Utilization PCIE TX  :", str(pcie_tx), '%')
				data["device_util_pcie_tx_" + gpuIdx] = pcie_tx

				self.debug(str(name), "Utilization PCIE RX  :", str(pcie_rx), '%')
				data["device_util_pcie_rx_" + gpuIdx] = pcie_rx

				self.debug(str(name), "Fan speed :", str(fanspeed), '%')
				data["device_fanspeed_" + gpuIdx] = fanspeed

				self.debug(str(name), "Power Usage :", str(power), 'Watt')
				data["device_power_" + gpuIdx] = power

				self.debug(str(name), "ECC errors:", str(eccErrors))
				if eccErrors is not None:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
				else:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

		## Get unit (S-class Nvidia cards) data
		if self.unitCount:
			for i in range(self.unitCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlUnitGetHandleByIndex(i)

				try:
					fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
					fan_speed = fan.speed  # Fan speed (RPM)
					fan_state = fan.state  # Flag that indicates whether fan is working properly
				except Exception as e:
					self.debug(str(e))
					fan_speed = None
					fan_state = None

				try:
					psu = pynvml.nvmlUnitGetPsuInfo(handle)
					psu_current = psu.current  # PSU current (A)
					psu_power = psu.power  # PSU power draw (W)
					psu_state = psu.state  # The power supply state
					psu_voltage = psu.voltage  # PSU voltage (V)
				except Exception as e:
					self.debug(str(e))
					psu_current = None
					psu_power = None
					psu_state = None
					psu_voltage = None

				try:
					temp_intake = pynvml.nvmlUnitGetTemperature(handle,0)  # Temperature at intake in C
					temp_exhaust = pynvml.nvmlUnitGetTemperature(handle,1)  # Temperature at exhaust in C
					temp_board = pynvml.nvmlUnitGetTemperature(handle,2)  # Temperature on board in C
				except Exception as e:
					self.debug(str(e))
					temp_intake = None
					temp_exhaust = None
					temp_board = None

				self.debug('Unit fan speed:',str(fan_speed))
				data["unit_fan_speed_" + gpuIdx] = fan_speed

				self.debug('Unit fan state:',str(fan_state))
				data["unit_fan_state_" + gpuIdx] = fan_state

				self.debug('Unit PSU current:',str(psu_current))
				data["unit_psu_current_" + gpuIdx] = psu_current

				self.debug('Unit PSU power:', str(psu_power))
				data["unit_psu_power_" + gpuIdx] = psu_power

				self.debug('Unit PSU state:', str(psu_state))
				data["unit_psu_state_" + gpuIdx] = psu_state

				self.debug('Unit PSU voltage:', str(psu_voltage))
				data["unit_psu_voltage_" + gpuIdx] = psu_voltage

				self.debug('Unit temp intake:', str(temp_intake))
				data["unit_temp_intake_" + gpuIdx] = temp_intake

				self.debug('Unit temp exhaust:', str(temp_exhaust))
				data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

				self.debug('Unit temp board:', str(temp_board))
				data["unit_temp_board_" + gpuIdx] = temp_board

		## Get data via legacy mode
		if self.legacy:
			try:
				output, error = Popen(
					[
						"nvidia-settings",
						"-c", ":0",
						"-q", "GPUUtilization",
						"-q", "GPUCurrentClockFreqs",
						"-q", "GPUCoreTemp",
						"-q", "TotalDedicatedGPUMemory",
						"-q", "UsedDedicatedGPUMemory"
					],
					shell=False,
					stdout=PIPE,stderr=PIPE).communicate()
				output = repr(str(output))
				if len(output) < 800:
					raise Exception('Error in fetching data from nvidia-settings ' + output)
				self.debug(str(error), output)
			except Exception as e:
				self.error(str(e))
				self.error('Setting legacy mode to False')
				self.legacy = False
				return data
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				if data["device_temp_" + gpuIdx] is None:
					coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_temp_" + gpuIdx] = int(coreTemp)
						self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
					except Exception as e:
						self.debug(str(e), "skipping device_temp_" + gpuIdx)
				if data["device_mem_used_" + gpuIdx] is None:
					memUsed = findall(r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_mem_used_" + gpuIdx] = int(memUsed)
						self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
					except Exception as e:
						self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
				if data["device_util_gpu_" + gpuIdx] is None:
					gpu_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
					try:
						data["device_util_gpu_" + gpuIdx] = int(gpu_util)
						self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
					except Exception as e:
						self.debug(str(e), "skipping device_util_gpu_" + gpuIdx)
				if data["device_util_mem_" + gpuIdx] is None:
					mem_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
					try:
						data["device_util_mem_" + gpuIdx] = int(mem_util)
						self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
					except Exception as e:
						self.debug(str(e), "skipping device_util_mem_" + gpuIdx)

		return data
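The twenty repetitive ECC packing lines inside _get_data could also be generated from the same type lists. A sketch of that alternative, producing identical keys (it would replace the if eccErrors block above):

if eccErrors is not None:
    for loc in memoryLocationType:
        for counter in eccCounterType:
            for err in memErrorType:
                # e.g. device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_0
                key = "device_ecc_errors_{0}_{1}_{2}_{3}".format(
                    loc, counter.replace('_ECC', ''),
                    err.replace('ERROR_TYPE_', ''), gpuIdx)
                data[key] = eccErrors[loc][counter][err]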
Example #7
def log_system(log_file, process_pids=None):
    """
    Log system utilization metrics (CPU, GPU, memory, process status) to the log file.
    """
    # log cpu util
    cpu_util = psutil.cpu_percent()
    cpu_util_ind = psutil.cpu_percent(percpu=True)
    ts = time.time()
    key = "INFO"
    message = "CPU util: {}% -- Individual utils 1-24: {}".format(
        cpu_util, cpu_util_ind[:24])
    write_to_log(log_file, (ts, key, message))
    message = "CPU util: {}% -- Individual utils 25-48: {}".format(
        cpu_util, cpu_util_ind[24:])
    write_to_log(log_file, (ts, key, message))

    # log GPU util and memory
    try:
        max_gpu_util = 0
        deviceCount = pynvml.nvmlDeviceGetCount()
        for idx in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            board_num = pynvml.nvmlDeviceGetBoardId(handle)
            name = "GPU {}: {}  (ID {})".format(
                idx,
                pynvml.nvmlDeviceGetName(handle).decode("utf-8"), board_num)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            fan_util = pynvml.nvmlDeviceGetFanSpeed(handle)
            # nvmlDeviceGetPcieThroughput takes an NVML_PCIE_UTIL_* counter
            # enum (TX is assumed here); the PCIe replay counter is a separate,
            # unrelated query and must not be passed as the counter type
            pcie_util = pynvml.nvmlDeviceGetPcieThroughput(
                handle, pynvml.NVML_PCIE_UTIL_TX_BYTES)
            gpu_util = util.gpu
            mem_util = util.memory

            message = "{}: Kernel:{}%  Mem:{}% Fan:{}% PCIe TX: {}MB/s".format(
                name, gpu_util, mem_util, fan_util, round(pcie_util / 1000, 1))
            ts = time.time()
            key = "INFO"
            write_to_log(log_file, (ts, key, message))

            if gpu_util > max_gpu_util:
                max_gpu_util = gpu_util

    except pynvml.NVMLError as error:
        print(error)

    # log memory util
    mem_util = psutil.virtual_memory()
    used = round(mem_util.used / 1e+9, 2)
    total = round(mem_util.total / 1e+9, 2)
    ts = time.time()
    key = "INFO"
    message = "Memory util: {}%  ({}/{}GB)".format(
        round(used / total * 100, 2), used, total)
    write_to_log(log_file, (ts, key, message))

    pid_statuses = []
    warning = False
    if process_pids is not None:
        for key in process_pids:
            pid = process_pids[key]

            try:
                os.kill(pid, 0)
                RUNNING = "running"
            except OSError:
                RUNNING = "stopped"
                warning = True

            pid_statuses.append("{} ({}): {}\n".format(key, pid, RUNNING))

        ts = time.time()
        key = "INFO"
        if warning:
            key = "WARNING"
        write_to_log(log_file, (ts, key, pid_statuses))

    last_log_time = time.time()
    return last_log_time, max_gpu_util
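log_system assumes NVML is already initialized and that write_to_log exists elsewhere in the module. A minimal harness sketch (this write_to_log is a stand-in, not the original helper):

import pynvml

def write_to_log(log_file, entry):
    # Stand-in: append the (timestamp, level, message) tuple as one line
    with open(log_file, "a") as f:
        f.write(repr(entry) + "\n")

pynvml.nvmlInit()
last_log_time, max_gpu_util = log_system("system.log")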