Example #1
def auto_select_gpu():
  """Select gpu which has largest free memory"""
  if HAS_NVML:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    largest_free_mem = 0
    largest_free_idx = 0
    for i in range(deviceCount):
      handle = pynvml.nvmlDeviceGetHandleByIndex(i)
      info = pynvml.nvmlDeviceGetMemoryInfo(handle)
      if info.free > largest_free_mem:
        largest_free_mem = info.free
        largest_free_idx = i
    pynvml.nvmlShutdown()
    largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

    idx_to_gpu_id = {}
    for i in range(deviceCount):
      idx_to_gpu_id[i] = '{}'.format(i)

    gpu_id = idx_to_gpu_id[largest_free_idx]
    logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
    return gpu_id
  else:
    logging.info('nvidia-ml-py is not installed; automatic GPU selection is disabled!')
    return '0'
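A minimal usage sketch (not part of the original snippet): pin the process to the selected device by exporting CUDA_VISIBLE_DEVICES before any CUDA context is created.

import os
os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()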
Example #2
def parse_cmdln():
    parser = get_args()
    args = parser.parse_args()
    if args.program == 'gpu_temp':

        assertion(nvmlInit,
                  ImportError('nvidia-ml-py is required for this program.'))

        assertion(mpl,
                  ImportError('matplotlib is required for this program.'))

        assertion(args.deviceID,
                  AssertionError('GPU index must be declared.'))

        nvmlInit()
        args.handle = nvmlDeviceGetHandleByIndex(args.deviceID)

    if args.program == 'cpu_usage':

        assertion(psutil,
                  ImportError('psutil is required for this program.'))

    if args.program == 'screen_glow':

        assertion(PIL,
                  ImportError('PIL is required for this program.'))

    return args
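The snippet relies on an `assertion` helper that is not shown; a plausible sketch, assuming it simply raises the given exception when the checked value is falsy (e.g. a module set to None after a failed import):

def assertion(value, exception):
    # Raise the supplied exception object when the precondition does not hold.
    if not value:
        raise exception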
Example #3
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return

        try:
            import pynvml as pv
        except ImportError:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")

        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)

            if not self.exp.meta_data.get('test_state'):
                for i in range(count):
                    handle = pv.nvmlDeviceGetHandleByIndex(i)
                    if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                        raise Exception("Unfortunately, GPU %i is busy. Try \
                            resubmitting the job to the queue." % i)
        except Exception as e:
            raise Exception("Unable to run GPU plugins: %s", e.message)
        self.__set_gpu_processes(count)
Example #4
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        # Sensor index 0 corresponds to NVML_TEMPERATURE_GPU (degrees Celsius).
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))

    nvmlShutdown()
    return gpus
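A small usage sketch (assumed, not from the original): print the temperature of every detected GPU in degrees Celsius.

for index, temp_c in get_gpu_temperatures().items():
    print("GPU %d: %d C" % (index, temp_c))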
Example #5
def get_gpu_mem_used():
    try:
        from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        meminfo = nvmlDeviceGetMemoryInfo(handle)
        return meminfo.used  # used memory on GPU 0, in bytes
    except Exception:
        return -1
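nvmlDeviceGetMemoryInfo reports bytes, and the function returns -1 when NVML is unavailable; a usage sketch under those assumptions:

used = get_gpu_mem_used()
if used >= 0:
    print("GPU 0 memory used: %.1f MiB" % (used / 1024.0 / 1024.0))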
Example #6
    def get_handles(self):
        """ Return all listed Nvidia handles """
        if IS_MACOS:
            self.handles = pynvx.cudaDeviceGetHandles(ignore=True)
        else:
            self.handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
                            for i in range(self.device_count)]
        if self.logger:
            self.logger.debug("GPU Handles found: %s", len(self.handles))
Example #7
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {'gpu': util_gpu, 'memory': util_mem},
                'memory': {'total': mem_total, 'free': mem_free,
                           'used': mem_used},
                'temperature': temperature,
                'power': {'draw': power_draw, 'limit': power_limit}
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #8
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem:        12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info   = nvml.nvmlDeviceGetMemoryInfo(handle)
        # `nice_ratio` is assumed to be a module-level constant: the polite
        # fraction of total memory that a single job may claim.
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfil the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
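A usage sketch under the assumption that `nice_ratio` is defined at module scope (the value 0.8 below is hypothetical): politely request 2 GB and receive the request expressed as a fraction of the currently free memory on GPU 0.

nice_ratio = 0.8  # assumed module-level constant used by request_mem()
print("requested fraction of free memory:", request_mem(2048))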
Example #9
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #10
    def _main_func():
      try:
        # first get name
        import torch as th
        import os
      except ImportError:
        self.P("ERROR: PyTorch not installed! Please install PyTorch.")
        return None

      nvsmires = None
      try:
        from pynvml.smi import nvidia_smi
        import pynvml
        nvsmi = nvidia_smi.getInstance()
        nvsmires = nvsmi.DeviceQuery('memory.free, memory.total, memory.used, utilization.gpu, temperature.gpu')
        pynvml_avail = True
      except Exception:
        pynvml_avail = False

      lst_inf = []
      # now we iterate all devices
      n_gpus = th.cuda.device_count()
      if n_gpus > 0:
        th.cuda.empty_cache()
      current_pid_has_usage = False
      current_pid_gpus = []

      try:
        for device_id in range(n_gpus):
          dct_device = {}
          device_props = th.cuda.get_device_properties(device_id)
          dct_device['NAME'] = device_props.name
          dct_device['TOTAL_MEM'] = round(
            device_props.total_memory / 1024 ** (2 if mb else 3),
            2
          )
          mem_total = None
          mem_allocated = None
          gpu_used = None
          gpu_temp = None
          gpu_temp_max = None
          if pynvml_avail and nvsmires is not None and 'gpu' in nvsmires:
            dct_gpu = nvsmires['gpu'][device_id]
            mem_total = round(
              dct_gpu['fb_memory_usage']['total'] / (1 if mb else 1024),
              2
            )  # already from th
            mem_allocated = round(
              dct_gpu['fb_memory_usage']['used'] / (1 if mb else 1024),
              2
            )
            gpu_used = dct_gpu['utilization']['gpu_util']
            if isinstance(gpu_used, str):
              gpu_used = -1
            gpu_temp = dct_gpu['temperature']['gpu_temp']
            gpu_temp_max = dct_gpu['temperature']['gpu_temp_max_threshold']

            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            processes = []
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
              dct_proc_info = {k.upper(): v for k,v in proc.__dict__.items()}
              used_mem = dct_proc_info.pop('USEDGPUMEMORY', None)
              dct_proc_info['ALLOCATED_MEM'] = round(
                used_mem / 1024 ** (2 if mb else 3) if used_mem is not None else 0.0,
                2
              )
              processes.append(dct_proc_info)
              if dct_proc_info['PID'] == os.getpid():
                current_pid_has_usage = True
                current_pid_gpus.append(device_id)
            #endfor
            dct_device['PROCESSES'] = processes
            dct_device['USED_BY_PROCESS'] = device_id in current_pid_gpus
          else:
            str_os = platform.platform()
            ## check if platform is Tegra and record
            if 'tegra' in str_os.lower():
              # we just record the overall free memory
              mem_total = self.get_machine_memory()
              mem_allocated = mem_total - self.get_avail_memory()
              gpu_used = 1
              gpu_temp = 1
              gpu_temp_max = 100
              if not self._done_first_smi_error and nvsmires is not None:
                self.P("Running `gpu_info` on Tegra platform: {}".format(nvsmires), color='r')
                self._done_first_smi_error = True
            elif not self._done_first_smi_error:
              str_log = "ERROR: Please make sure you have both pytorch and pynvml in order to monitor the GPU"
              str_log += "\nError info: pynvml_avail={}, nvsmires={}".format(pynvml_avail, nvsmires)
              self.P(str_log)
              self._done_first_smi_error = True
          #endif
          dct_device['ALLOCATED_MEM'] = mem_allocated
          dct_device['FREE_MEM'] = -1
          if all(x is not None for x in [mem_total, mem_allocated]):
            dct_device['FREE_MEM'] = round(mem_total - mem_allocated,2)
          dct_device['MEM_UNIT'] = 'MB' if mb else 'GB'
          dct_device['GPU_USED'] = gpu_used
          dct_device['GPU_TEMP'] = gpu_temp
          dct_device['GPU_TEMP_MAX'] = gpu_temp_max

          lst_inf.append(dct_device)
        #end for all devices
      except Exception as e:
        self.P("gpu_info exception for device_id {}:\n{}".format(device_id, e), color='r')

      if show:
        self.P("GPU information for {} device(s):".format(len(lst_inf)), color='y')
        for dct_gpu in lst_inf:
          for k, v in dct_gpu.items():
            self.P("  {:<14} {}".format(k + ':', v), color='y')

      if current_pid and current_pid_has_usage:
        return [lst_inf[x] for x in current_pid_gpus]
      else:
        return lst_inf
Example #11
    def myfuncHyper():
        import pynvml

        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        handle = pynvml.nvmlDeviceGetHandleByIndex(
            int(os.environ['CUDA_VISIBLE_DEVICES']))
        gpuMem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print("Init")
        print(gpuMem.used)

        dictionary = dict(hyperParams.iloc[paramNr])

        if not str(dictionary) in sizeDict:
            basicArchitecture = hyperParams.iloc[paramNr].basicArchitecture

            exec(
                open(basePath + 'model' + basicArchitecture[0] + '.py').read(),
                globals())
            continueComputations = False
            saveComputations = False
            #exec(open(basePath+'runEpochs'+basicArchitecture[0]+'.py').read(), globals())

            my_yDenseData = np.zeros((batchSize, nrOutputTargets))

            my_inputDropout = 0.2
            my_hiddenDropout = 0.5
            my_lrGeneral = 0.1
            my_lrWeight = 0.0
            my_lrBias = 0.0
            my_l2PenaltyWeight = 0.1
            my_l2PenaltyBias = 0.1
            my_l1PenaltyWeight = 0.1
            my_l1PenaltyBias = 0.1
            my_mom = 0.0
            my_biasInit = np.zeros(nrOutputTargets)
            my_is_training = True

            if nrDenseFeatures > 0:
                my_xDenseData = np.zeros((batchSize, nrDenseFeatures))

            if nrSparseFeatures > 0:
                indices = np.random.random_integers(0,
                                                    nrSparseFeatures - 1,
                                                    size=batchSize *
                                                    estNonZFeatures)
                indptr = np.random.random_integers(0,
                                                   len(indices) - 1,
                                                   size=batchSize)
                indptr.sort()
                indptr[0] = 0
                indptr = np.append(indptr, len(indices))
                data = np.random.random_integers(
                    0, 430000, size=batchSize * estNonZFeatures).astype(
                        np.float32) / 430000
                mycsr = scipy.sparse.csr_matrix((data, indices, indptr),
                                                (batchSize, nrSparseFeatures))
                mycsr.sort_indices()
                nonzx = mycsr.nonzero()
                valnonzx = (mycsr)[nonzx[0], nonzx[1]]

                my_xIndices = np.int64(np.vstack(nonzx).T)
                my_xValues = valnonzx.A.flatten()
                my_xDim = [mycsr.shape[0], mycsr.shape[1]]

                my_sparseMeanInit = np.zeros((1, nrSparseFeatures))

            myfeed = {
                yDenseData: my_yDenseData,
                inputDropout: my_inputDropout,
                hiddenDropout: my_hiddenDropout,
                lrGeneral: my_lrGeneral,
                lrWeight: my_lrWeight,
                lrBias: my_lrBias,
                l2PenaltyWeight: my_l2PenaltyWeight,
                l2PenaltyBias: my_l2PenaltyBias,
                l1PenaltyWeight: my_l1PenaltyWeight,
                l1PenaltyBias: my_l1PenaltyBias,
                mom: my_mom,
                biasInit: my_biasInit,
                is_training: my_is_training
            }

            if nrDenseFeatures > 0:
                myfeed.update({xDenseData: my_xDenseData})

            if nrSparseFeatures > 0:
                myfeed.update({
                    xIndices: my_xIndices,
                    xValues: my_xValues,
                    xDim: my_xDim,
                    sparseMeanInit: my_sparseMeanInit,
                })

            _ = session.run([init])
            if nrSparseFeatures > 0:
                _ = session.run([sparseMeanInitOp], feed_dict=myfeed)
                _ = session.run([sparseMeanWSparseOp])
            _ = session.run([optimizerDense], feed_dict=myfeed)
            _ = session.run([predNetwork], feed_dict=myfeed)

            print("GPU")
            gpuMem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            sizeDict[str(dictionary)] = gpuMem.used
            sizeArray[paramNr] = gpuMem.used
            print(gpuMem.used)
        else:
            sizeArray[paramNr] = sizeDict[str(dictionary)]
Example #12
if not os.path.exists(Cfg.modelDir):
    os.mkdir(Cfg.modelDir)
if not os.path.exists(Cfg.logDir):
    os.mkdir(Cfg.logDir)
if not os.path.exists(Cfg.dataDir):
    os.mkdir(Cfg.dataDir)

# Fix the training devices and random seed.
if torch.cuda.is_available():
    np.random.seed(Cfg.seed)
    torch.cuda.manual_seed(Cfg.seed)
    if Cfg.GPUID > -1:
        torch.cuda.set_device(Cfg.GPUID)
        # Get the GPU logger.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(Cfg.GPUID)
    device = 'cuda'
else:
    np.random.seed(Cfg.seed)
    torch.manual_seed(Cfg.seed)
    device = 'cpu'

# Set the parameters of the Lee Oscillator for tanh.
if Cfg.LeeTanhType == 'A' or Cfg.LeeTanhType == 'a':
    a = [0.6, 0.6, -0.5, 0.5, -0.6, -0.6, -0.5, 0.5]
elif Cfg.LeeTanhType == 'B' or Cfg.LeeTanhType == 'b':
    a = [1, 1, 1, 1, -1, -1, -1, -1]
elif Cfg.LeeTanhType == 'C' or Cfg.LeeTanhType == 'c':
    a = [0.55, 0.55, -0.5, 0.5, -0.55, -0.55, 0.5, -0.5]
elif Cfg.LeeTanhType == 'D' or Cfg.LeeTanhType == 'd':
    a = [1, 1, 1, 1, -1, -1, -1, -1]
Example #13
    def setup(self):
        class TimeOutException(Exception):
            pass

        def alarm_handler(signum, frame):
            raise TimeOutException()

        self.data["root"] = os.getcwd()
        program = os.getenv(env.PROGRAM) or util.get_program()
        if program:
            self.data["program"] = program
        else:
            self.data["program"] = '<python with no main file>'
            if wandb._get_python_type() != "python":
                if os.getenv(env.NOTEBOOK_NAME):
                    self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
                else:
                    meta = wandb.jupyter.notebook_metadata()
                    if meta.get("path"):
                        if "fileId=" in meta["path"]:
                            self.data[
                                "colab"] = "https://colab.research.google.com/drive/" + meta[
                                    "path"].split("fileId=")[1]
                            self.data["program"] = meta["name"]
                        else:
                            self.data["program"] = meta["path"]
                            self.data["root"] = meta["root"]

        if not os.getenv(env.DISABLE_CODE):
            logger.debug("code probe starting")
            in_jupyter = wandb._get_python_type() != "python"
            # windows doesn't support alarm() and jupyter could call this in a thread context
            if platform.system() == "Windows" or not hasattr(
                    signal, 'SIGALRM') or in_jupyter:
                logger.debug("non time limited probe of code")
                self._setup_code_git()
                self._setup_code_program()
            else:
                old_alarm = None
                try:
                    try:
                        old_alarm = signal.signal(signal.SIGALRM,
                                                  alarm_handler)
                        signal.alarm(25)
                        self._setup_code_git()
                        self._setup_code_program()
                    finally:
                        signal.alarm(0)
                except TimeOutException:
                    logger.debug("timeout waiting for setup_code")
                finally:
                    if old_alarm:
                        signal.signal(signal.SIGALRM, old_alarm)
            logger.debug("code probe done")

        self.data["startedAt"] = datetime.utcfromtimestamp(
            wandb.START_TIME).isoformat()
        try:
            username = getpass.getuser()
        except KeyError:
            # getuser() could raise KeyError in restricted environments like
            # chroot jails or docker containers.  Return user id in these cases.
            username = str(os.getuid())

        # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users.
        if self._api.settings().get('anonymous') != 'true':
            self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
            self.data["username"] = os.getenv(env.USERNAME, username)
            self.data["executable"] = sys.executable
        else:
            self.data.pop("email", None)
            self.data.pop("root", None)

        self.data["os"] = platform.platform(aliased=True)
        self.data["python"] = platform.python_version()

        if env.get_docker():
            self.data["docker"] = env.get_docker()
        try:
            pynvml.nvmlInit()
            self.data["gpu"] = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass
        # TODO: we should use the cuda library to collect this
        if os.path.exists("/usr/local/cuda/version.txt"):
            with open("/usr/local/cuda/version.txt") as f:
                self.data["cuda"] = f.read().split(" ")[-1].strip()
        self.data["args"] = sys.argv[1:]
        self.data["state"] = "running"
Example #14
    def step(self):
        valuesDict = {}
        valuesDict['table'] = self._tableName
        cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
        mem = valuesDict['mem'] = psutil.virtual_memory().percent
        swap = valuesDict['swap'] = psutil.swap_memory().percent
        # some code examples:
        # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
        if self.doGpu:
            for i in self.gpusToUse:
                try:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    memInfo = nvmlDeviceGetMemoryInfo(handle)
                    valuesDict["gpuMem_%d" % i] = \
                        float(memInfo.used)*100./float(memInfo.total)
                    util = nvmlDeviceGetUtilizationRates(handle)
                    valuesDict["gpuUse_%d" % i] = util.gpu
                    temp = nvmlDeviceGetTemperature(handle,
                                                    NVML_TEMPERATURE_GPU)
                    valuesDict["gpuTem_%d" % i] = temp
                except NVMLError as err:
                    handle = nvmlDeviceGetHandleByIndex(i)
                    msg = "Device %d -> %s not suported\n" \
                          "Remove device %d from FORM" % \
                          (i, nvmlDeviceGetName(handle), i)
                    errorWindow(None, msg)
        if self.doNetwork:
            try:
                # measure a short interval
                pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
                time.sleep(self.samplingTime)  # sec
                pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
                bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
                bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
                valuesDict["%s_send" % self.nif] = \
                    bytes_sent * self.samplingTime / 1048576
                valuesDict["%s_recv" % self.nif] = \
                    bytes_recv * self.samplingTime / 1048576
            except:
                msg = "cannot get information of network interface %s" % \
                      self.nif

        if self.doDiskIO:
            try:
                # measure a short interval
                disk_before = psutil.disk_io_counters(perdisk=False)
                time.sleep(self.samplingTime)  # sec
                disk_after = psutil.disk_io_counters(perdisk=False)
                bytes_read = disk_after.read_bytes - disk_before.read_bytes
                bytes_write = disk_after.write_bytes - disk_before.write_bytes
                valuesDict["disk_read"] = \
                    self.samplingTime * bytes_read / self.mega
                valuesDict["disk_write"] = \
                    self.samplingTime * bytes_write / self.mega
            except:
                msg = "cannot get information of disk usage "

        if self.cpuAlert < 100 and cpu > self.cpuAlert:
            self.warning("CPU allocation =%f." % cpu)
            self.cpuAlert = cpu

        if self.memAlert < 100 and mem > self.memAlert:
            self.warning("Memory allocation =%f." % mem)
            self.memAlert = mem

        if self.swapAlert < 100 and swap > self.swapAlert:
            self.warning("SWAP allocation =%f." % swap)
            self.swapAlert = swap

        sqlCommand = "INSERT INTO %(table)s ("
        for label in self.labelList:
            sqlCommand += "%s, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ") VALUES("
        for label in self.labelList:
            sqlCommand += "%"+"(%s)f, " % label
        # remove last comma
        sqlCommand = sqlCommand[:-2]
        sqlCommand += ");"

        sql = sqlCommand % valuesDict

        try:
            self.cur.execute(sql)
        except Exception as e:
            print("ERROR: saving one data point (monitor). I continue")

        # Return finished = True if all protocols have finished
        finished = []
        for prot in self.protocols:
            updatedProt = getUpdatedProtocol(prot)
            finished.append(updatedProt.getStatus() != STATUS_RUNNING)

        return all(finished)
Example #15
def device_name():
    with pynvml_context():
        device_name = device_name_for(pynvml.nvmlDeviceGetHandleByIndex(0))
        return device_name
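The helpers pynvml_context() and device_name_for() are not part of the excerpt; a hedged sketch of what they might look like: a context manager that pairs nvmlInit/nvmlShutdown, plus a small wrapper that decodes the device name.

import contextlib
import pynvml

@contextlib.contextmanager
def pynvml_context():
    # Initialise NVML on entry and always shut it down on exit.
    pynvml.nvmlInit()
    try:
        yield
    finally:
        pynvml.nvmlShutdown()

def device_name_for(handle):
    # Older pynvml versions return bytes, newer ones return str.
    name = pynvml.nvmlDeviceGetName(handle)
    return name.decode("utf-8") if isinstance(name, bytes) else name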
Example #16
    def get(self):
        """Write the web page content."""
        global cpu_load
        global gpu_load_compute
        global gpu_load_memory
        memory = psutil.virtual_memory()
        swap = psutil.swap_memory()
        if nvidia:
            nvmlHandle = nvmlDeviceGetHandleByIndex(0)
            gpu = nvmlDeviceGetName(nvmlHandle).decode('utf-8')
            gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle)
            gpu_ram = round(gpu_memory.total / (1024 * 1048576), 2)
            gpu += " - " + str(gpu_ram) + "GB"
        else:
            gpu = "Not recognized"
        ram = str(int(round(float(memory.total) / (1024 * 1048576)))) + "GB"
        ram += " (swap: " + str(int(round(float(swap.total) / (1024 * 1048576)))) + "GB)"
        real_cores = psutil.cpu_count(False)
        cores_ratio = int(psutil.cpu_count(True) / real_cores)
        cores = " (" + str(cores_ratio) + "x " + str(real_cores) + " cores)"
        if sys.platform.startswith('linux'):
            distribution = distro.linux_distribution()
            os_name = 'Linux ' + distribution[0] + " " + distribution[1] + " " + distribution[2]
            command = "cat /proc/cpuinfo"
            all_info = subprocess.check_output(command, shell=True).decode('utf-8').strip()
            for line in all_info.split("\n"):
                if "model name" in line:
                    cpu = re.sub(".*model name.*:", "", line, 1)
                    break
        elif sys.platform == 'win32':
            computer = wmi.WMI()
            os_info = computer.Win32_OperatingSystem()[0]
            cpu = computer.Win32_Processor()[0].Name
            os_name = os_info.Name.split('|')[0] + ", version " + os_info.Version
        elif sys.platform == 'darwin':
            os_name = 'macOS ' + platform.mac_ver()[0]
            os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
            command = 'sysctl -n machdep.cpu.brand_string'
            cpu = subprocess.check_output(command, shell=True).decode('utf-8').strip()
        else:  # unknown platform
            os_name = 'Unknown'
            cpu = 'Unknown'
        self.write("<!DOCTYPE html>\n")
        self.write("<html><head><meta charset='utf-8'/><title>Webots simulation server</title>")
        self.write("<link rel='stylesheet' type='text/css' href='css/monitor.css'></head>\n")
        self.write("<body><h1>Webots simulation server: " + socket.getfqdn() + "</h1>")
        self.write("<h2>Host: " + os_name + "</h2>\n")
        self.write("<p><b>CPU load: %g%%</b><br>\n" % cpu_load)
        self.write(cpu + cores + "</p>\n")
        self.write("<p><b>GPU load compute: %g%% &mdash; load memory: %g%%</b><br>\n" %
                   (gpu_load_compute, gpu_load_memory))
        self.write(gpu + "</p>\n")
        self.write("<p><b>RAM:</b><br>" + ram + "</p>\n")
        self.write("<canvas id='graph' height='400' width='1024'></canvas>\n")
        self.write("<script src='https://www.cyberbotics.com/harry-plotter/0.9f/harry.min.js'></script>\n")
        self.write("<script>\n")
        self.write("window.onload = function() {\n")

        def appendData(label):
            global snapshots
            d = "{title:'" + label + "',values:["
            for s in snapshots:
                d += str(s.data[label]) + ','
            return d[:-1] + "]},"

        datas = ''
        datas += appendData('Webots running')
        datas += appendData('Webots idle')
        datas += appendData('CPU load')
        datas += appendData('CPU memory')
        datas += appendData('GPU load compute')
        datas += appendData('GPU load memory')
        datas += appendData('GPU memory')
        datas += appendData('Swap')
        datas += appendData('Disk')
        datas += appendData('Network sent')
        datas += appendData('Network received')

        datas = datas[:-1]  # remove the last comma
        self.write("  plotter({\n")
        self.write("    canvas: 'graph',\n")
        self.write("    datas:[ " + datas + "],\n")
        self.write("""
     labels:{
        ypos:"left",
        x:100,
        y:[50,100],
        marks:2
     },
     fill:"none",
     opacity:0.5,
     linewidth:3,
     background:"#fff",
     autoscale:"top",
     grid:{
        x:[0,100]
     },
     mouseover:{
        radius:4,
        linewidth:2,
        bullet:"#444",
        shadowbox:"1,1,0,#000",
        axis:"x"
     }
  });""")
        self.write("}\n")
        self.write("</script>\n")
        self.write("</body></html>")
Example #17
def update_snapshot():
    """Compute a monitoring snapshot."""
    global current_load
    global network_sent
    global network_received
    global cpu_load
    global gpu_load_compute
    global gpu_load_memory
    memory = psutil.virtual_memory()
    swap = psutil.swap_memory()
    disk = psutil.disk_usage('/')
    n = psutil.net_io_counters()
    new_network_sent = n.bytes_sent
    new_network_received = n.bytes_recv
    network_sent_rate = float(new_network_sent - network_sent) / (SNAPSHOT_REFRESH * 1000000)  # expressed in MB/s
    network_received_rate = float(new_network_received - network_received) / (SNAPSHOT_REFRESH * 1000000)  # MB/s
    network_sent = new_network_sent
    network_received = new_network_received
    global nvidia
    if nvidia:
        nvmlHandle = nvmlDeviceGetHandleByIndex(0)
        gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle)
        gpu_ram_usage = round(100 * float(gpu_memory.used) / float(gpu_memory.total), 1)
    else:  # not supported
        nvmlHandle = 0
        gpu_ram_usage = 0
    cpu_load = psutil.cpu_percent()
    try:
        gpu_load = nvmlDeviceGetUtilizationRates(nvmlHandle)
        gpu_load_compute = gpu_load.gpu
        gpu_load_memory = gpu_load.memory
    except NVMLError:  # not supported on some hardware
        gpu_load_compute = 0
        gpu_load_memory = 0
    webots_idle = 0
    webots_running = 0
    for client in ClientWebSocketHandler.clients:
        if client.idle:
            webots_idle = webots_idle + 1
        else:
            webots_running = webots_running + 1
    snapshot = Snapshot()
    snapshot.data['Timestamp'] = int(time.time())
    snapshot.data['Webots running'] = webots_running
    snapshot.data['Webots idle'] = webots_idle
    snapshot.data['CPU load'] = cpu_load
    snapshot.data['CPU memory'] = memory.percent
    snapshot.data['GPU load compute'] = gpu_load_compute
    snapshot.data['GPU load memory'] = gpu_load_memory
    snapshot.data['GPU memory'] = gpu_ram_usage
    snapshot.data['Swap'] = swap.percent
    snapshot.data['Disk'] = disk.percent
    snapshot.data['Network sent'] = network_sent_rate
    snapshot.data['Network received'] = network_received_rate
    snapshot.write()
    current_load = 0
    for key, value in snapshot.data.items():
        if key == 'Timestamp':
            continue
        if value > current_load:
            current_load = value
    snapshots.append(snapshot)
    if len(snapshots) > 600:  # display data for the last 10 minutes
        del snapshots[0]
    tornado.ioloop.IOLoop.current().add_timeout(int(time.time()) + SNAPSHOT_REFRESH, update_snapshot)
Example #19
def printGPUINFO():
    # Assumes pynvml.nvmlInit() has already been called elsewhere.
    gpu_id = config.GPU_ID
    gpu_obj = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    print("gpu mem used:", pynvml.nvmlDeviceGetMemoryInfo(gpu_obj).used / 1024 / 1024, "MB")
Example #20
def getFreeRatio(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    use = pynvml.nvmlDeviceGetUtilizationRates(handle)
    ratio = 0.5 * (float(use.gpu) + float(use.memory))
    return ratio
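A possible usage sketch (assumed, not from the original): initialise NVML and pick the index of the least busy GPU according to the combined utilisation score.

pynvml.nvmlInit()
least_busy = min(range(pynvml.nvmlDeviceGetCount()), key=getFreeRatio)
pynvml.nvmlShutdown()
print("least busy GPU:", least_busy)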
Example #21
def lane_attn():
    # data read
    x_train = np.load(r'./data_split/x_train.npy')
    x_test = np.load(r'./data_split/x_test.npy')
    x_validation = np.load(r'./data_split/x_validation.npy')
    y_train = np.load(r'./data_split/y_train.npy')
    y_test = np.load(r'./data_split/y_test.npy')
    y_validation = np.load(r'./data_split/y_validation.npy')

    # standardize the data (zero mean, unit variance)
    a, b, c = x_train.shape
    x_train = x_train.reshape(a * b, c)
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_train = x_train.reshape(a, b, c)

    a, b, c = x_validation.shape
    x_validation = x_validation.reshape(a * b, c)
    x_validation = scaler.transform(x_validation)
    x_validation = x_validation.reshape(a, b, c)

    a, b, c = x_test.shape
    x_test = x_test.reshape(a * b, c)
    x_test = scaler.transform(x_test)
    x_test = x_test.reshape(a, b, c)

    x1 = torch.from_numpy(x_train).float()
    y1 = torch.from_numpy(y_train).float()
    x2 = torch.from_numpy(x_validation).float()
    y2 = torch.from_numpy(y_validation).float()
    x3 = torch.from_numpy(x_test).float()
    y3 = torch.from_numpy(y_test).float()

    # convert the .npy arrays to PyTorch tensors

    global BATCH_SIZE
    BATCH_SIZE = 512

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = Data.TensorDataset(x1, y1)

    trainloader = Data.DataLoader(
        dataset=train_dataset,  # torch TensorDataset format
        batch_size=BATCH_SIZE,  # mini batch size
        shuffle=False,  # whether to shuffle the data (shuffling is usually better)
        num_workers=2,  # number of worker processes for reading the data
        drop_last=True,
    )

    vali_dataset = Data.TensorDataset(x2, y2)

    valiloader = Data.DataLoader(
        dataset=vali_dataset,  # torch TensorDataset format
        batch_size=BATCH_SIZE,  # mini batch size
        shuffle=False,  # whether to shuffle the data (shuffling is usually better)
        num_workers=2,  # number of worker processes for reading the data
        drop_last=True,
    )

    test_dataset = Data.TensorDataset(x3, y3)

    testloader = Data.DataLoader(
        dataset=test_dataset,  # torch TensorDataset format
        batch_size=BATCH_SIZE,  # mini batch size
        shuffle=False,  # whether to shuffle the data (shuffling is usually better)
        num_workers=2,  # number of worker processes for reading the data
        drop_last=True,
    )

    #Encoder structure
    class Encoder(nn.Module):
        def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout=0):
            #n_layers represents the layer of LSTM

            super().__init__()

            self.input_dim = input_dim
            self.emb_dim = emb_dim
            self.hid_dim = hid_dim
            self.n_layers = n_layers
            self.dropout = dropout

            self.embedding = nn.Linear(input_dim, emb_dim)

            self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

            self.dropout = nn.Dropout(dropout)

        def forward(self, x):

            #x = [src sent len, batch size,2]

            embedded = self.dropout(self.embedding(x))

            #embedded = [src sent len, batch size, emb dim]

            outputs, (hidden, cell) = self.rnn(embedded)

            #outputs = [len, batch size, hid dim * n directions]
            #direction == 1 here
            #hidden = [n layers * n directions, batch size, hid dim]
            #cell = [n layers * n directions, batch size, hid dim]

            #outputs are always from the top hidden layer

            return outputs, hidden, cell

#Decoder structure

    class Decoder(nn.Module):
        def __init__(self,
                     decoder_input_dim,
                     emb_dim,
                     hid_dim,
                     n_layers,
                     dropout=0.2):
            super().__init__()

            self.emb_dim = emb_dim
            self.hid_dim = hid_dim
            self.decoder_input_dim = decoder_input_dim
            self.n_layers = n_layers
            self.dropout = dropout

            self.embedding = nn.Linear(decoder_input_dim, emb_dim)

            self.attn = nn.Linear(40 + self.emb_dim, 1)
            #         self.attn_combine = nn.Linear(self.hidden_size+self.de_emb_dim , self.hidden_size)

            self.rnn = nn.LSTM(emb_dim + hid_dim,
                               hid_dim,
                               n_layers,
                               dropout=dropout)

            self.out = nn.Linear(hid_dim, decoder_input_dim)

            self.dropout = nn.Dropout(dropout)

        def forward(self, input, context1, context2, context3, hidden, cell):

            #hidden = [n layers * n directions, batch size, hid dim]
            #cell = [n layers * n directions, batch size, hid dim]

            #n directions in the decoder will both always be 1, therefore:
            #hidden = [n layers, batch size, hid dim]
            #context = [n layers, batch size, hid dim]
            input = input.unsqueeze(0)
            embedded = self.dropout(self.embedding(input))
            context1 = context1.permute(1, 0, 2)
            context2 = context2.permute(1, 0, 2)
            context3 = context3.permute(1, 0, 2)
            #         context=[batch,1,hidden_dim]
            embedded = embedded.permute(1, 0, 2)

            #          embedded[batch,1,embedded_dim]
            attn1 = self.attn(torch.cat((embedded, context1), dim=2))
            attn2 = self.attn(torch.cat((embedded, context2), dim=2))
            attn3 = self.attn(torch.cat((embedded, context3), dim=2))

            #         attn1=np.array(attn1)
            attn1 = attn1.squeeze(1)
            attn2 = attn2.squeeze(1)
            attn3 = attn3.squeeze(1)
            #print(attn1.size())
            attn = torch.cat((attn1, attn2, attn3), 1)
            #print(attn.size())
            attn_weights = F.softmax(attn, dim=1)
            #print(attn_weights.size())
            #print(attn_weights[200,:])
            attn_weights = attn_weights.unsqueeze(1)
            context1 = context1.permute(0, 2, 1)
            context2 = context2.permute(0, 2, 1)
            context3 = context3.permute(0, 2, 1)

            #print(context1.shape)
            #print(attn_weights[:,:,0].shape)
            context1 = torch.bmm(context1, attn_weights[:, :, 0].unsqueeze(2))
            context2 = torch.bmm(context2, attn_weights[:, :, 1].unsqueeze(2))
            context3 = torch.bmm(context3, attn_weights[:, :, 2].unsqueeze(2))

            context1 = context1.permute(0, 2, 1)
            context2 = context2.permute(0, 2, 1)
            #permute swaps the last and the middle dimension
            context3 = context3.permute(0, 2, 1)

            context = torch.cat((context1, context2, context3), dim=2)
            #print(context.shape)

            embedded = embedded.permute(1, 0, 2)
            context = context.permute(1, 0, 2)

            emb_con = torch.cat((embedded, context), 2)

            output, (hidden, cell) = self.rnn(emb_con, (hidden, cell))

            #output = [len, batch size, hid dim * n directions]
            #hidden = [n layers * n directions, batch size, hid dim]
            #cell = [n layers * n directions, batch size, hid dim]

            #sent len and n directions will always be 1 in the decoder, therefore:
            #output = [1, batch size, hid dim]
            #hidden = [n layers, batch size, hid dim]
            #cell = [n layers, batch size, hid dim]

            prediction = self.out(output.squeeze(0))

            #prediction = [batch size, output dim]

            return prediction, hidden, cell

# seq-seq combine encoder-decoder and
#the decoder process is done step by step

    class Seq2Seq(nn.Module):
        global firstinput

        def __init__(self, encoder, decoder, device):
            super().__init__()

            self.encoder = encoder

            self.decoder = decoder
            self.device = device

            #assert encoder.hid_dim == decoder.hid_dim,             "Hidden dimensions of encoder and decoder must be equal!"
            assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!"

        def forward(self, x, y, teacher_forcing_ratio=0.5):

            #teacher_forcing_ratio is probability to use teacher forcing
            #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time

            batch_size = BATCH_SIZE
            max_len = 25
            trg_vocab_size = 2

            #tensor to store decoder outputs
            outputs = torch.zeros(max_len, batch_size, trg_vocab_size)

            #last hidden state of the encoder is used as the initial hidden state of the decoder
            #choose the cell state of second layer as context
            encoder_outputs1, hidden1, cell1 = self.encoder(
                x[:, :, [0, 1, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31]])
            encoder_outputs2, hidden2, cell2 = self.encoder(
                x[:, :, [8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35]])
            encoder_outputs3, hidden3, cell3 = self.encoder(
                x[:, :, [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]])
            #take the cell of last layer as context
            context1 = cell1[1, :, :]
            context1 = context1.unsqueeze(0)
            context2 = cell2[1, :, :]
            context2 = context2.unsqueeze(0)
            context3 = cell3[1, :, :]
            context3 = context3.unsqueeze(0)
            hidden = torch.cat((hidden1, hidden2, hidden3), dim=2)
            cell = torch.cat((cell1, cell2, cell3), dim=2)
            #cell=cell1+cell2+cell3
            #         print('c-shape:',context.shape)

            input = firstinput
            #the first input of the decoder; we feed the true value
            # input = input.unsqueeze(0)
            #print(input.size())
            for t in range(max_len):

                output, hidden, cell = self.decoder(input, context1, context2,
                                                    context3, hidden, cell)
                outputs[t] = output
                #print(output)
                #input = output.unsqueeze(0)
                #print(input.size())
                #context=cell[1,:,:]
                #context=context.unsqueeze(0)
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output
                if t == 24:
                    break
                input = ((y[t, :, :]) if teacher_force else top1)
                #outputs[t] = output
                #print('output',output.size())
                #input = output.unsqueeze(0)

            return outputs

    INPUT_DIM = 12
    DECODER_INPUT_DIM = 2
    HID_DIM = 40
    HID_DIM1 = 120
    N_LAYERS = 2
    ENC_EMB_DIM = 32
    DEC_EMB_DIM = 16

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS)
    dec = Decoder(DECODER_INPUT_DIM, DEC_EMB_DIM, HID_DIM1, N_LAYERS)

    model = Seq2Seq(enc, dec, device).to(device)

    def init_weights(m):
        for name, param in m.named_parameters():
            nn.init.uniform_(param.data, -0.15, 0.15)

    #         nn.init.orthogonal_(param.data)

    model.apply(init_weights)

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    optimizer = optim.Adam(model.parameters(), weight_decay=0.00001, lr=0.01)
    criterion = nn.MSELoss()

    def train(model, dataloader, optimizer, criterion, clip):
        global firstinput
        model.train()

        epoch_loss = 0

        for x, y in dataloader:

            x = x.transpose(1, 0)
            y = y.transpose(1, 0)
            x = x.to('cuda')
            y = y.to('cuda')
            firstinput = y[0, :, :]
            y = y[1:, :, :]
            optimizer.zero_grad()

            output = model(x, y)
            output = output.to('cuda')

            #         loss = criterion(output, y)
            #print(output.size())
            # the lateral position gets more attention, so its loss term is weighted by 3
            loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion(
                output[:, :, 0], y[:, :, 0])
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()

            epoch_loss += loss.item()
            #print(epoch_loss)

        return epoch_loss / len(dataloader)

    # In[ ]:

    def evaluate(model, validataloader, criterion):

        model.eval()

        epoch_loss = 0
        # no loss backward,
        with torch.no_grad():

            for x, y in validataloader:

                x = x.transpose(1, 0)
                y = y.transpose(1, 0)
                x = x.to('cuda')
                y = y.to('cuda')
                firstinput = y[0, :, :]
                y = y[1:, :, :]
                optimizer.zero_grad()

                output = model(x, y, 0)  #turn off teacher forcing
                output = output.to('cuda')

                loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion(
                    output[:, :, 0], y[:, :, 0])
                epoch_loss += loss.item()

        return epoch_loss / len(validataloader)

    # In[ ]:

    def test(model, testdataloader, criterion):
        global j
        global firstinput
        global test_result
        model.eval()

        epoch_loss = 0

        with torch.no_grad():

            for x, y in testdataloader:

                x = x.transpose(1, 0)
                y = y.transpose(1, 0)
                x = x.to('cuda')
                y = y.to('cuda')
                firstinput = y[0, :, :]
                y = y[1:, :, :]
                optimizer.zero_grad()

                output = model(x, y, 0)  #turn off teacher forcing
                test_result[:, j:j + BATCH_SIZE, :] = output
                j = j + BATCH_SIZE
                output = output.to('cuda')

                #             loss = criterion(output, y)
                loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion(
                    output[:, :, 0], y[:, :, 0])
                epoch_loss += loss.item()

    #     print(len(testdataloader))

        return epoch_loss / len(testdataloader)

    # In[ ]:

    N_EPOCHS = 40
    CLIP = 1
    # CLIP clips the gradients to prevent them from exploding
    global test_result
    test_result = np.zeros([25, 80000, 2])
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print('this is lane-attention\n')
    for epoch in range(N_EPOCHS):
        global j
        j = 0
        start_time = time.process_time()
        train_loss = train(model, trainloader, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, valiloader, criterion)
        end_time = time.process_time()
        print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time}s')
        print(f'\tTrain Loss: {train_loss:.3f} |  Val. Loss: {valid_loss:.3f}')
        #writer.add_scalars('loss',{'train_loss': train_loss,
        #'valid_loss': valid_loss},epoch )
        test_loss = test(model, testloader, criterion)
        if test_loss < 1.7:
            print('testloss:', test_loss)
            test_result = test_result[:, :j, :]
            np.save(r'./result/lane_attn_predict_tra.npy', test_result)
            np.save(r'./result/true_tra.npy', y_test[:, 1:, :])
            break
        if epoch == 39:
            print('testloss:', test_loss)
            test_result = test_result[:, :j, :]
            np.save(r'./result/lane_attn_predict_tra.npy', test_result)
            np.save(r'./result/true_tra.npy', y_test[:, 1:, :])
            break
    print('meminfo.used:', meminfo.used / (1024 * 1024))
    print('meminfo.total:', meminfo.total / (1024 * 1024))

    return 0
Example #22
    def __init__(self, ground_policy, F_s, F_sa, env, device, log,
                 hyperparameters):
        self.env = env
        self.device = device
        self.log = log
        self.hyperparameters = hyperparameters
        self.ground_policy = ground_policy
        self.name = ""
        self.verbose = hyperparameters["verbose"]

        # Check env:
        self.discrete_env = True if 'Discrete' in str(
            env.action_space) else False
        if self.discrete_env:
            self.num_actions = self.env.action_space.n
            self.action_low = torch.zeros(self.num_actions, device=self.device)
            self.action_high = torch.ones(self.num_actions, device=self.device)
            if self.verbose:
                print("Num actions: ", self.num_actions)
        else:
            self.num_actions = len(self.env.action_space.high)
            self.action_low = torch.tensor(env.action_space.low,
                                           device=self.device)
            self.action_high = torch.tensor(env.action_space.high,
                                            device=self.device)
            if self.verbose:
                print("Env action low: ", self.action_low)
                print("Env action high: ", self.action_high)

        # Set up parameters:
        # Actor-Critic:
        self.use_actor_critic = hyperparameters["use_actor_critic"]
        self.use_CACLA_V = hyperparameters["use_CACLA_V"]
        self.use_CACLA_Q = hyperparameters["use_CACLA_Q"]
        self.use_DDPG = hyperparameters["use_DDPG"]
        self.use_SPG = hyperparameters["use_SPG"]
        self.use_GISPG = hyperparameters["use_GISPG"]
        # QV:
        self.use_QV = hyperparameters["use_QV"]
        self.use_QVMAX = hyperparameters["use_QVMAX"]
        # Exploration:
        self.gaussian_action_noise = hyperparameters["action_sigma"]
        self.boltzmann_exploration_temp = hyperparameters["boltzmann_temp"]
        self.epsilon = hyperparameters["epsilon"]
        self.epsilon_mid = hyperparameters["epsilon_mid"]
        if self.epsilon_mid:
            self.eps_factor = self.epsilon_mid**(1 / hyperparameters["steps"])
            self.epsilon = 1
        # General:
        self.use_half = hyperparameters["use_half"]
        self.batch_size = hyperparameters["batch_size"]
        self.use_world_model = hyperparameters["use_world_model"]

        # TODO: -Include PER with prioritization based on Upper Bound of Gradient Norm.
        # TODO: -include different sampling schemes from the papers investigating PER in SL (small and big buffer for gradient norm too)

        # TODO: -add goal to replay buffer and Transition (For HRL)
        # GPU memory tracking (via NVML) if a GPU is available:
        if torch.cuda.is_available():
            nvmlInit()
            self.nvml_handle = nvmlDeviceGetHandleByIndex(0)
            self.max_gpu_bytes = torch.cuda.get_device_properties(
                self.device).total_memory
        self.mem_usage = None
        self.current_episode = []
        # Eligibility traces:
        self.use_efficient_traces = hyperparameters["use_efficient_traces"]
        self.elig_traces_update_steps = hyperparameters[
            "elig_traces_update_steps"]
        self.elig_traces_anneal_lambda = hyperparameters[
            "elig_traces_anneal_lambda"]
        self.lambda_val = hyperparameters["elig_traces_lambda"]
        # Set up replay buffer:
        self.stack_dim = hyperparameters["stack_dim"]
        self.stack_count = hyperparameters["frame_stack"]
        self.buffer_size = hyperparameters[
            "replay_buffer_size"] + hyperparameters["num_expert_samples"]
        self.use_PER = hyperparameters["use_PER"]
        self.use_CER = hyperparameters["use_CER"]
        self.PER_alpha = hyperparameters["PER_alpha"]
        self.PER_start_beta = hyperparameters["PER_beta"]
        self.PER_beta = self.PER_start_beta
        self.PER_anneal_beta = hyperparameters["PER_anneal_beta"]
        self.PER_max_priority = hyperparameters["PER_max_priority"]
        self.PER_running_avg = hyperparameters["PER_running_avg"]
        self.importance_weights = None

        # Create replay buffer:
        self.memory = self.create_replay_buffer()

        # Feature extractors:
        self.F_s = F_s
        self.F_sa = F_sa
        self.state_feature_len = F_s.layers_merge[-1].out_features
        if F_sa is not None:
            self.state_action_feature_len = F_sa.layers_merge[-1].out_features

        # Set up Networks:
        self.use_half = hyperparameters[
            "use_half"] and torch.cuda.is_available()
        self.nets = []
        self.actor, self.Q, self.V = self.init_actor_critic(
            self.F_s, self.F_sa)
Example #23
0
    def training_step(self, batch, batch_idx) -> Dict:

        global isEmUpdateBusy  # used to check whether the entire embedding update process has finished
        global isAddIndexBusy  # used to check whether the entire indexing process has finished
        global processes  # used to keep the embedding update worker processes
        global threadHandle_index  # used to keep the process handle for embedding indexing

        if (self.trainer.global_rank == 0) and (self.custom_config.end2end):

            if (not batch_idx == 0) and (
                    batch_idx % self.custom_config.indexing_freq == 0):
                free_gpu_list = []
                nvmlInit()
                deviceCount = nvmlDeviceGetCount()

                my_list = json.loads(self.custom_config.gpu_order)

                for i in range(deviceCount):
                    handle = nvmlDeviceGetHandleByIndex(i)
                    info = nvmlDeviceGetMemoryInfo(handle)

                    if info.used / 1e6 < 15:
                        position = my_list.index(i)
                        free_gpu_list.append("cuda:" + str(position))

                if len(free_gpu_list) >= self.custom_config.index_gpus:
                    has_free_gpus = True

                else:
                    has_free_gpus = False

                if (not isEmUpdateBusy) and has_free_gpus:

                    model_copy = type(self.model.rag.ctx_encoder)(
                        self.config_dpr
                    )  # get a new instance; this will be loaded on the CPU
                    model_copy.load_state_dict(self.model.rag.ctx_encoder.
                                               state_dict())  # copy weights

                    processes = []

                    if len(free_gpu_list) > self.custom_config.index_gpus:
                        cuda_devices = random.sample(
                            free_gpu_list, self.custom_config.index_gpus)
                    else:
                        cuda_devices = free_gpu_list

                    num_processes = len(cuda_devices)

                    for rank in range(num_processes):
                        logger.info(
                            "Iniitializing  embedding calculation process rank{}"
                            .format(rank))
                        device = cuda_devices[rank]
                        p = multiprocessing.Process(
                            target=embed_update,
                            args=(
                                copy.deepcopy(model_copy),
                                num_processes,
                                device,
                                rank,
                                self.custom_config.shard_dir,
                                self.custom_config.csv_path,
                            ),
                        )
                        processes.append(p)

                    for p in processes:
                        p.start()

                    isEmUpdateBusy = True

            if isEmUpdateBusy and (not isAddIndexBusy):
                index_process_list = [
                    processes[k].is_alive()
                    for k in range(self.custom_config.index_gpus)
                ]
                if (
                        sum(index_process_list) == 0
                ):  # If the entire list is False, all embedding calculation processes have finished
                    logger.info("Start adding the index")
                    threadHandle_index = multiprocessing.Process(
                        target=add_index,
                        args=(
                            self.custom_config.shard_dir,
                            self.config.index_path,
                        ),
                    )
                    threadHandle_index.start()
                    isAddIndexBusy = True

            # check when index building has started
            if isAddIndexBusy:

                # check whether the index-building process is still running
                if not threadHandle_index.is_alive():

                    logger.info("Merging the dataset shards")
                    saved_dataset_shards = []

                    for address in glob(
                            str(self.custom_config.shard_dir) + "/*/"):
                        saved_dataset_shards.append(load_from_disk(address))

                    concat = concatenate_datasets(saved_dataset_shards)
                    concat.save_to_disk(
                        self.config.passages_path
                    )  # here we update the main passage file on the disk
                    logger.info("done updating the dataset")

                    # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker.
                    # logger.info("then updating the index")
                    # shutil.copy(self.custom_config.temp_index, self.config.idex_path)

                    logger.info(
                        "Loading new passages and iniitalzing new index")
                    self.trainer.model.module.module.model.rag.retriever.re_load(
                    )
                    self.trainer.model.module.module.model.rag.retriever.init_retrieval(
                    )

                    isEmUpdateBusy = False
                    isAddIndexBusy = False

        self.trainer.accelerator_connector.accelerator.barrier(
            "barrier")  # waint untill the index and kb get re-initialized.

        loss_tensors = self._step(batch)

        logs = {
            name: loss
            for name, loss in zip(self.loss_names, loss_tensors)
        }
        # tokens per batch
        tgt_pad_token_id = (self.tokenizer.generator.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() +
                       batch["decoder_input_ids"].ne(tgt_pad_token_id).sum())
        self.log("loss", loss_tensors[0])
        return loss_tensors[0]
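
The training_step() above treats a GPU as free when NVML reports that less than 15 MB of its memory is in use. A minimal standalone sketch of that check, assuming only pynvml (the helper name and the threshold parameter are illustrative, not part of the original code):

import pynvml

def list_idle_gpus(threshold_mb=15):
    """Return NVML indices of GPUs whose used memory is below threshold_mb."""
    pynvml.nvmlInit()
    try:
        idle = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.used / 1e6 < threshold_mb:  # bytes -> MB, same test as above
                idle.append(i)
        return idle
    finally:
        pynvml.nvmlShutdown()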
Example #24
0
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import DataRange1d, NumeralTickFormatter, BasicTicker
from bokeh.layouts import column
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import all_palettes

import math
import time

import pynvml

from jupyterlab_nvdashboard.utils import format_bytes

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]


def gpu(doc):
    fig = figure(title="GPU Utilization",
                 sizing_mode="stretch_both",
                 x_range=[0, 100])

    def get_utilization():
        return [
            pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
            for i in range(ngpus)
        ]

    gpu = get_utilization()
    y = list(range(len(gpu)))
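
Example #24 is truncated before get_utilization() is wired into the bokeh figure. As a rough stand-in that skips bokeh entirely, the same per-GPU utilization numbers can be polled directly; the loop count and sleep interval below are arbitrary choices for illustration:

import time
import pynvml

pynvml.nvmlInit()
handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
           for i in range(pynvml.nvmlDeviceGetCount())]
for _ in range(5):
    gpu_util = [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles]
    print("GPU utilization (%):", gpu_util)
    time.sleep(1)
pynvml.nvmlShutdown()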
Example #25
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Example #26
0
def getFreeRatio(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    use = pynvml.nvmlDeviceGetUtilizationRates(handle)
    # Average of GPU core and memory-controller utilization (percent).
    ratio = 0.5 * (float(use.gpu) + float(use.memory))
    return ratio
import utils.blob as blob_utils
import utils.net as net_utils
import utils.Lossfuction as Lossfuction
import utils.resnet_weights_helper as resnet_utils
from lib.nn import SynchronizedBatchNorm2d
import pynvml
import cv2
import modeling.CRL as CRL
from torchvision.utils import make_grid
from tensorboardX import SummaryWriter
from torch.utils.checkpoint import checkpoint
logger = logging.getLogger(__name__)


pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
def get_func(func_name):
    """Helper to return a function object by name. func_name must identify a
    function in this module or the path to a function relative to the base
    'modeling' module.
    """
    if func_name == '':
        return None
    try:
        parts = func_name.split('.')
        # Refers to a function in this module
        if len(parts) == 1:
            return globals()[parts[0]]
        # Otherwise, assume we're referencing a module under modeling
        module_name = 'modeling.' + '.'.join(parts[:-1])
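
getFreeRatio() above folds core and memory-controller utilization into a single load score. A hedged sketch of how such a score could drive device selection (the helper below is illustrative, not part of the original example; it relies on the module-level pynvml.nvmlInit() call above):

def pick_least_loaded_gpu():
    """Return the index of the GPU with the lowest combined utilization score."""
    count = pynvml.nvmlDeviceGetCount()
    return min(range(count), key=getFreeRatio)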
Example #28
0
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}
                def meminfo(memory):
                    return {
                            "total"  : int(memory.total),
                            "free"   : int(memory.free),
                            "used"   : int(memory.used),
                            }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                       ("name",                     "nvmlDeviceGetName",                    (),     str),
                       ("serial",                   "nvmlDeviceGetSerial",                  (),     str),
                       ("uuid",                     "nvmlDeviceGetUUID",                    (),     str),
                       ("pci",                      "nvmlDeviceGetPciInfo",                 (),     pciinfo),
                       ("memory",                   "nvmlDeviceGetMemoryInfo",              (),     meminfo),
                       ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration",   (),     int),
                       ("pcie-link-width-max",      "nvmlDeviceGetMaxPcieLinkWidth",        (),     int),
                       ("pcie-link-generation",     "nvmlDeviceGetCurrPcieLinkGeneration",  (),     int),
                       ("pcie-link-width",          "nvmlDeviceGetCurrPcieLinkWidth",       (),     int),
                       ("clock-info-graphics",      "nvmlDeviceGetClockInfo",               (0,),   int),
                       ("clock-info-sm",            "nvmlDeviceGetClockInfo",               (1,),   int),
                       ("clock-info-mem",           "nvmlDeviceGetClockInfo",               (2,),   int),
                       ("clock-info-graphics-max",  "nvmlDeviceGetMaxClockInfo",            (0,),   int),
                       ("clock-info-sm-max",        "nvmlDeviceGetMaxClockInfo",            (1,),   int),
                       ("clock-info-mem-max",       "nvmlDeviceGetMaxClockInfo",            (2,),   int),
                       ("fan-speed",                "nvmlDeviceGetFanSpeed",                (),     int),
                       ("temperature",              "nvmlDeviceGetTemperature",             (0,),   int),
                       ("power-state",              "nvmlDeviceGetPowerState",              (),     int),
                       ("vbios-version",            "nvmlDeviceGetVbiosVersion",            (),     str),
                       ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
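
identify_cards() returns a dict keyed by device index whose values contain only the properties NVML could actually supply. A minimal usage sketch (the printed keys follow the property names in the table above; anything that failed to query is simply absent):

cards = identify_cards()
for idx, props in cards.items():
    mem = props.get("memory", {})
    print(idx, props.get("name"),
          "memory total/free/used:", mem.get("total"), mem.get("free"), mem.get("used"))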
Example #29
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(pid):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory /
                                                  1024 / 1024)
                process['pid'] = nv_process.pid
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except:
                power_limit = None

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None  # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process.pid)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            gpu_info = {
                'index':
                index,
                'uuid':
                uuid,
                'name':
                name,
                'temperature.gpu':
                temperature,
                'utilization.gpu':
                utilization.gpu if utilization else None,
                'power.draw':
                int(power / 1000) if power is not None else None,
                'enforced.power.limit':
                int(power_limit / 1000) if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                int(memory.used / 1024 / 1024) if memory else None,
                'memory.total':
                int(memory.total / 1024 / 1024) if memory else None,
                'processes':
                processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list)
Example #30
0
def base_structure():
    # data read
    x_train=np.load(r'./data_split/x_train.npy')
    x_test=np.load(r'./data_split/x_test.npy')
    x_validation=np.load(r'./data_split/x_validation.npy')
    y_train=np.load(r'./data_split/y_train.npy')
    y_test=np.load(r'./data_split/y_test.npy')
    y_validation=np.load(r'./data_split/y_validation.npy')
    
    #data standard normalization
    a,b,c=x_train.shape
    x_train=x_train.reshape(a*b,c)
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train=scaler.transform(x_train)
    x_train=x_train.reshape(a,b,c)

    a,b,c=x_validation.shape
    x_validation=x_validation.reshape(a*b,c)
    x_validation=scaler.transform(x_validation)
    x_validation=x_validation.reshape(a,b,c)

    a,b,c=x_test.shape
    x_test=x_test.reshape(a*b,c)
    x_test=scaler.transform(x_test)
    x_test=x_test.reshape(a,b,c)

    x1=torch.from_numpy(x_train).float()
    y1=torch.from_numpy(y_train).float()
    x2=torch.from_numpy(x_validation).float()
    y2=torch.from_numpy(y_validation).float()
    x3=torch.from_numpy(x_test).float()
    y3=torch.from_numpy(y_test).float()
    
    #data from.npy to pytorch data

    global BATCH_SIZE
    BATCH_SIZE=512


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = Data.TensorDataset(x1,y1)

    trainloader = Data.DataLoader(
        dataset=train_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=False,              # whether to shuffle the data (shuffling is usually better)
        num_workers=2,              # load data with multiple worker processes
        drop_last=True,
    )



    vali_dataset = Data.TensorDataset(x2,y2)

    valiloader = Data.DataLoader(
        dataset=vali_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=False,              # whether to shuffle the data (shuffling is usually better)
        num_workers=2,              # load data with multiple worker processes
        drop_last=True,
    )


    test_dataset = Data.TensorDataset(x3,y3)

    testloader = Data.DataLoader(
        dataset=test_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=False,              # whether to shuffle the data (shuffling is usually better)
        num_workers=2,              # load data with multiple worker processes
        drop_last=True,
    )
    
#encoder str

    class Encoder(nn.Module):
        def __init__(self, input_dim,emb_dim, hid_dim, n_layers, dropout=0.1):
            super().__init__()

            self.input_dim = input_dim
            self.emb_dim = emb_dim
            self.hid_dim = hid_dim
            self.n_layers = n_layers
            self.dropout = dropout

            self.embedding = nn.Linear(input_dim, emb_dim)

            self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

            self.dropout = nn.Dropout(dropout)

        def forward(self, x):

            #x = [len, batch size,inout_size]

            embedded = self.dropout(self.embedding(x))

            #embedded = [len, batch size, emb dim]

            outputs, (hidden, cell) = self.rnn(embedded)

            #outputs = [src sent len, batch size, hid dim * n directions]
            #hidden = [n layers * n directions, batch size, hid dim]
            #cell = [n layers * n directions, batch size, hid dim]

            #outputs are always from the top hidden layer

            return hidden, cell
# decoder str

    class Decoder(nn.Module):
        def __init__(self, decoder_input_dim,emb_dim, hid_dim, n_layers, dropout=0.2):
            super().__init__()

            self.emb_dim = emb_dim
            self.hid_dim = hid_dim
            self.decoder_input_dim = decoder_input_dim
            self.n_layers = n_layers
            self.dropout = dropout

            self.embedding = nn.Linear(decoder_input_dim, emb_dim)

            self.rnn = nn.LSTM(emb_dim+hid_dim, hid_dim, n_layers, dropout = dropout)

            self.out = nn.Linear(hid_dim, decoder_input_dim)

            self.dropout = nn.Dropout(dropout)

        def forward(self, input,context, hidden, cell):

            
            input = input.unsqueeze(0)
            embedded = self.dropout(self.embedding(input))

            #input = [1, batch size]
    #         print('inputshape:',input.shape)
            emb_con = torch.cat((embedded, context), dim = 2)

            ##embedded = self.dropout(self.embedding(input))

            #embedded = [1, batch size, emb dim]

            output, (hidden, cell) = self.rnn(emb_con, (hidden, cell))

            #output = [len, batch size, hid dim * n directions]
            #hidden = [n layers * n directions, batch size, hid dim]
            #cell = [n layers * n directions, batch size, hid dim]

            #sent len and n directions will always be 1 in the decoder, therefore:
            #output = [1, batch size, hid dim]
            #hidden = [n layers, batch size, hid dim]
            #cell = [n layers, batch size, hid dim]

            prediction = self.out(output.squeeze(0))

            #prediction = [batch size, output dim]

            return prediction, hidden, cell

        
    class Seq2Seq(nn.Module):
        global firstinput
        def __init__(self, encoder, decoder, device):
            super().__init__()

            self.encoder = encoder

            self.decoder = decoder
            self.device = device

            assert encoder.hid_dim == decoder.hid_dim, \
                "Hidden dimensions of encoder and decoder must be equal!"
            assert encoder.n_layers == decoder.n_layers, \
                "Encoder and decoder must have equal number of layers!"

        def forward(self, x, y, teacher_forcing_ratio = 0.5):

            #src = [src sent len, batch size]
            #trg = [trg sent len, batch size]
            #teacher_forcing_ratio is probability to use teacher forcing
            #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time

            batch_size = BATCH_SIZE
            max_len = 25
            trg_vocab_size = 2

            #tensor to store decoder outputs
            outputs = torch.zeros(max_len, batch_size, trg_vocab_size)

            #last hidden state of the encoder is used as the initial hidden state of the decoder
            hidden, cell = self.encoder(x)
            context=cell[1,:,:]
            context=context.unsqueeze(0)
    #         print('c-shape:',context.shape)
            #first input to the decoder is the <sos> tokens
            input=firstinput
            #print(input.size())
    #         input = input.unsqueeze(0)
            #print(input.size())
            for t in range(max_len):

                output, hidden, cell = self.decoder(input, context,hidden, cell)
                outputs[t] = output
                #print(output)
                #input = output.unsqueeze(0)
                #print(input.size())
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output
                if t==24:
                    break
                input = ((y[t,:,:]) if teacher_force else top1)
                #outputs[t] = output
                #print('output',output.size())
                #input = output.unsqueeze(0)


            return outputs
        
    INPUT_DIM =36
    ENCODER_INPUT_DIM = 2
    HID_DIM = 128
    N_LAYERS = 2
    ENC_EMB_DIM = 64
    DEC_EMB_DIM = 16

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM , HID_DIM, N_LAYERS)
    dec = Decoder(ENCODER_INPUT_DIM,DEC_EMB_DIM , HID_DIM, N_LAYERS)

    model = Seq2Seq(enc, dec, device).to(device)


    # In[ ]:


    def init_weights(m):
        for name, param in m.named_parameters():
            nn.init.uniform_(param.data, -0.15, 0.15)
    #         nn.init.orthogonal_(param.data)

    model.apply(init_weights)


    # In[ ]:


    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(model):,} trainable parameters')


    # In[ ]:


    optimizer = optim.Adam(model.parameters(),weight_decay=0.00001,lr=0.01)
    criterion = nn.MSELoss()
    
    def train(model, dataloader,optimizer, criterion, clip):
        global firstinput
        model.train()

        epoch_loss = 0

        for x,y in dataloader:

            x=x.transpose(1,0)
            y=y.transpose(1,0)
            x=x.to('cuda')
            y=y.to('cuda')
            firstinput=y[0,:,:]
            y=y[1:,:,:]
            optimizer.zero_grad()

            output = model(x, y)
            output = output.to('cuda')


    #         loss = criterion(output, y)
            #print(output.size())
            loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0])
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()

            epoch_loss += loss.item()
            #print(epoch_loss)

        return epoch_loss/len(dataloader)


    # In[ ]:


    def evaluate(model, validataloader, criterion):

        model.eval()

        epoch_loss = 0

        with torch.no_grad():

            for x,y in validataloader:

                x=x.transpose(1,0)
                y=y.transpose(1,0)
                x=x.to('cuda')
                y=y.to('cuda')
                firstinput=y[0,:,:]
                y=y[1:,:,:]
                optimizer.zero_grad()

                output = model(x, y, 0) #turn off teacher forcing
                output = output.to('cuda')


                loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0])
                epoch_loss += loss.item()


        return epoch_loss / len(validataloader)


    # In[ ]:


    def test(model, testdataloader, criterion):
        global j
        global firstinput
        global test_result
        model.eval()

        epoch_loss = 0

        with torch.no_grad():

            for x,y in testdataloader:

                x=x.transpose(1,0)
                y=y.transpose(1,0)
                x=x.to('cuda')
                y=y.to('cuda')
                firstinput=y[0,:,:]
                y=y[1:,:,:]
                optimizer.zero_grad()

                output = model(x, y, 0) #turn off teacher forcing
                test_result[:,j:j+BATCH_SIZE,:]=output
                j=j+BATCH_SIZE
                output = output.to('cuda')


    #             loss = criterion(output, y)
                loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0])
                epoch_loss += loss.item()

    #     print(len(testdataloader))

        return epoch_loss / len(testdataloader)


    # In[ ]:


    N_EPOCHS = 40
    CLIP = 1
    global test_result
    test_result=np.zeros([25,80000,2])
    pynvml.nvmlInit()
    handle=pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo=pynvml.nvmlDeviceGetMemoryInfo(handle)
    print('this is base-line\n')
    for epoch in range(N_EPOCHS):
        global j
        j=0
        start_time = time.process_time()
        train_loss = train(model, trainloader, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, valiloader, criterion)
        end_time = time.process_time()
        print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time}s')
        print(f'\tTrain Loss: {train_loss:.3f} |  Val. Loss: {valid_loss:.3f}')
        #writer.add_scalars('loss',{'train_loss': train_loss,
                                   #'valid_loss': valid_loss},epoch )
        test_loss = test(model, testloader, criterion)
        if test_loss<3.9:
            print('testloss:',test_loss)
            test_result=test_result[:,:j,:]
            np.save(r'./result/base_predict_tra.npy',test_result)
            np.save(r'./result/true_tra.npy',y_test[:,1:,:])
            break
        if epoch == 39:
            print('testloss:',test_loss)
            test_result=test_result[:,:j,:]
            np.save(r'./result/base_predict_tra.npy',test_result)
            np.save(r'./result/true_tra.npy',y_test[:,1:,:])
            break
    print('meminfo.used:',meminfo.used/(1024*1024))
    print('meminfo.total:',meminfo.total/(1024*1024))
    
    return 0
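
One caveat shared by both training scripts above: meminfo is queried once, before the training loop, so the used/total values printed at the end reflect GPU memory at that earlier moment, not after training. If a post-training figure is wanted, the memory info would need to be re-queried from the same handle just before printing, for example:

    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)  # fresh snapshot taken after training
    print('meminfo.used:', meminfo.used / (1024 * 1024))
    print('meminfo.total:', meminfo.total / (1024 * 1024))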
Example #31
0
	def _get_data(self):
		data = {}

		if self.deviceCount:
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				name = pynvml.nvmlDeviceGetName(handle)
				brand = pynvml.nvmlDeviceGetBrand(handle)
				brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce', 'Titan']

				### Get data ###
				## Memory usage
				try:
					mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
				except Exception as e:
					self.debug(str(e))
					mem = None

				## ECC errors
				try:
					eccErrors = {}
					eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
					memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
					memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY']
					for memoryLocation in range(5):
						# Fresh dicts per location/counter so the nested entries are not shared references
						_eccCounter = {}
						for eccCounter in range(2):
							_memError = {}
							for memError in range(2):
								_memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(handle,memError,eccCounter,memoryLocation)
							_eccCounter[eccCounterType[eccCounter]] = _memError
						eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
				except Exception as e:
					self.debug(str(e))
					eccErrors = None

				## Temperature
				try:
					temp = pynvml.nvmlDeviceGetTemperature(handle,pynvml.NVML_TEMPERATURE_GPU)
				except Exception as e:
					self.debug(str(e))
					temp = None

				## Fan
				try:
					fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
				except Exception as e:
					self.debug(str(e))
					fanspeed = None

				## GPU and Memory Utilization
				try:
					util = pynvml.nvmlDeviceGetUtilizationRates(handle)
					gpu_util = util.gpu
					mem_util = util.memory
				except Exception as e:
					self.debug(str(e))
					gpu_util = None
					mem_util = None

				## Encoder Utilization
				try:
					encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
					enc_util = encoder[0]
				except Exception as e:
					self.debug(str(e))
					enc_util = None

				## Decoder Utilization
				try:
					decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
					dec_util = decoder[0]
				except Exception as e:
					self.debug(str(e))
					dec_util = None

				## Clock frequencies
				try:
					clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
					clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
					clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
				except Exception as e:
					self.debug(str(e))
					clock_core = None
					clock_sm = None
					clock_mem = None

				### Packing data ###
				self.debug("Device", gpuIdx, ":", str(name))
				data["device_name_" + gpuIdx] = name

				self.debug("Brand:", str(brands[brand]))

				self.debug(str(name), "Temp      :", str(temp))
				data["device_temp_" + gpuIdx] = temp

				self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
				data["device_mem_total_" + gpuIdx] = mem.total

				self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
				data["device_mem_used_" + gpuIdx] = mem.used

				self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
				data["device_mem_free_" + gpuIdx] = mem.free

				self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
				data["device_load_gpu_" + gpuIdx] = gpu_util

				self.debug(str(name), "Load MEM  :", str(mem_util), '%')
				data["device_load_mem_" + gpuIdx] = mem_util

				self.debug(str(name), "Load ENC  :", str(enc_util), '%')
				data["device_load_enc_" + gpuIdx] = enc_util

				self.debug(str(name), "Load DEC  :", str(dec_util), '%')
				data["device_load_dec_" + gpuIdx] = dec_util

				self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
				data["device_core_clock_" + gpuIdx] = clock_core

				self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
				data["device_sm_clock_" + gpuIdx] = clock_sm

				self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
				data["device_mem_clock_" + gpuIdx] = clock_mem

				self.debug(str(name), "Fan speed :", str(fanspeed), '%')
				data["device_fanspeed_" + gpuIdx] = fanspeed

				self.debug(str(name), "ECC errors:", str(eccErrors))
				if eccErrors is not None:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
					data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
				else:
					data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

		## Get unit (S-class Nvidia cards) data
		if self.unitCount:
			for i in range(self.unitCount):
				gpuIdx = str(i)
				handle = pynvml.nvmlUnitGetHandleByIndex(i)

				try:
					fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
					fan_speed = fan.speed  # Fan speed (RPM)
					fan_state = fan.state  # Flag that indicates whether fan is working properly
				except Exception as e:
					self.debug(str(e))
					fan_speed = None
					fan_state = None

				try:
					psu = pynvml.nvmlUnitGetPsuInfo(handle)
					psu_current = psu.current  # PSU current (A)
					psu_power = psu.power  # PSU power draw (W)
					psu_state = psu.state  # The power supply state
					psu_voltage = psu.voltage  # PSU voltage (V)
				except Exception as e:
					self.debug(str(e))
					psu_current = None
					psu_power = None
					psu_state = None
					psu_voltage = None

				try:
					temp_intake = pynvml.nvmlUnitGetTemperature(handle,0)  # Temperature at intake in C
					temp_exhaust = pynvml.nvmlUnitGetTemperature(handle,1)  # Temperature at exhaust in C
					temp_board = pynvml.nvmlUnitGetTemperature(handle,2)  # Temperature on board in C
				except Exception as e:
					self.debug(str(e))
					temp_intake = None
					temp_exhaust = None
					temp_board = None

				self.debug('Unit fan speed:',str(fan_speed))
				data["unit_fan_speed_" + gpuIdx] = fan_speed

				self.debug('Unit fan state:',str(fan_state))
				data["unit_fan_state_" + gpuIdx] = fan_state

				self.debug('Unit PSU current:',str(psu_current))
				data["unit_psu_current_" + gpuIdx] = psu_current

				self.debug('Unit PSU power:', str(psu_power))
				data["unit_psu_power_" + gpuIdx] = psu_power

				self.debug('Unit PSU state:', str(psu_state))
				data["unit_psu_state_" + gpuIdx] = psu_state

				self.debug('Unit PSU voltage:', str(psu_voltage))
				data["unit_psu_voltage_" + gpuIdx] = psu_voltage

				self.debug('Unit temp intake:', str(temp_intake))
				data["unit_temp_intake_" + gpuIdx] = temp_intake

				self.debug('Unit temp exhaust:', str(temp_exhaust))
				data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

				self.debug('Unit temp board:', str(temp_board))
				data["unit_temp_board_" + gpuIdx] = temp_board

		## Get data via legacy mode
		if self.legacy:
			try:
				output, error = Popen(
					[
						"nvidia-settings",
						"-c", ":0",
						"-q", "GPUUtilization",
						"-q", "GPUCurrentClockFreqs",
						"-q", "GPUCoreTemp",
						"-q", "TotalDedicatedGPUMemory",
						"-q", "UsedDedicatedGPUMemory"
					],
					shell=False,
					stdout=PIPE,stderr=PIPE).communicate()
				output = repr(str(output))
				if len(output) < 800:
					raise Exception('Error in fetching data from nvidia-settings ' + output)
				self.debug(str(error), output)
			except Exception as e:
				self.error(str(e))
				self.error('Setting legacy mode to False')
				self.legacy = False
				return data
			for i in range(self.deviceCount):
				gpuIdx = str(i)
				if data["device_temp_" + gpuIdx] is None:
					coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_temp_" + gpuIdx] = int(coreTemp)
						self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
					except Exception as e:
						self.debug(str(e), "skipping device_temp_" + gpuIdx)
				if data["device_mem_used_" + gpuIdx] is None:
					memUsed = findall(r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
					try:
						data["device_mem_used_" + gpuIdx] = int(memUsed)
						self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
					except Exception as e:
						self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
				if data["device_load_gpu_" + gpuIdx] is None:
					gpu_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
					try:
						data["device_load_gpu_" + gpuIdx] = int(gpu_util)
						self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
					except Exception as e:
						self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
				if data["device_load_mem_" + gpuIdx] is None:
					mem_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
					try:
						data["device_load_mem_" + gpuIdx] = int(mem_util)
						self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
					except Exception as e:
						self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
				if data["device_core_clock_" + gpuIdx] is None:
					clock_core = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1]
					try:
						data["device_core_clock_" + gpuIdx] = int(clock_core)
						self.debug('Using legacy core_clock for GPU {0}: {1}'.format(gpuIdx, clock_core))
					except Exception as e:
						self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
				if data["device_mem_clock_" + gpuIdx] is None:
					clock_mem = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2]
					try:
						data["device_mem_clock_" + gpuIdx] = int(clock_mem)
						self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem))
					except Exception as e:
						self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

		return data
Example #32
0
import pynvml
import numpy as np
import os
import time
pynvml.nvmlInit()
# the 0 here is the GPU index (id)
handle0 = pynvml.nvmlDeviceGetHandleByIndex(0)
handle1 = pynvml.nvmlDeviceGetHandleByIndex(1)
handle2 = pynvml.nvmlDeviceGetHandleByIndex(2)
handle3 = pynvml.nvmlDeviceGetHandleByIndex(3)

memInfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
memInfo1 = pynvml.nvmlDeviceGetMemoryInfo(handle1)
memInfo2 = pynvml.nvmlDeviceGetMemoryInfo(handle2)
memInfo3 = pynvml.nvmlDeviceGetMemoryInfo(handle3)

commandList = ['',
               '',
               '',]
commandFalg = np.ones(len(commandList))

def getUsedRate(memInfo):
    return memInfo.used / memInfo.total


def sendCommand(deviceID):
    print(os.system('python train.py --epochs 1002 --devices 0'))
    print(str(deviceID) + ': command')
    exit()
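
Example #32 stops before the loop that would tie commandList, getUsedRate() and sendCommand() together. A heavily hedged guess at such a dispatch loop; the threshold, polling interval and function name are placeholders, not taken from the original code:

def wait_and_dispatch(threshold=0.1, poll_seconds=60):
    """Poll the four GPUs and hand the first sufficiently idle one to sendCommand()."""
    handles = [handle0, handle1, handle2, handle3]
    while True:
        for device_id, h in enumerate(handles):
            if getUsedRate(pynvml.nvmlDeviceGetMemoryInfo(h)) < threshold:
                sendCommand(device_id)  # note: sendCommand() exits the process
        time.sleep(poll_seconds)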

Example #33
0
def _get_device_info(device_id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

    return mem_info
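
_get_device_info() assumes pynvml.nvmlInit() has already been called elsewhere. A minimal usage sketch:

import pynvml

pynvml.nvmlInit()
mem = _get_device_info(0)  # memory info for GPU 0
print(mem.used // (1024 * 1024), 'MiB used of', mem.total // (1024 * 1024), 'MiB')
pynvml.nvmlShutdown()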
Example #34
0
 def init(self):
     
     self.util_history = []
     self.temp_history = []
     pynvml.nvmlInit()
     self.gpu_handles = []
     self.deviceCount = pynvml.nvmlDeviceGetCount()
     
     for i in range(self.deviceCount):
         self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
     
     self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
     self.cpu_prog_bars = []
     self.gpu_boxes = []
     self.gpu_prog_bars = []
     
     self.prev_idle = []
     self.prev_total = []
     self.idle = []
     self.total = []
     
     #---cpu_box---
     try:
         stat = open("/proc/stat")
         
         statlines = stat.read().splitlines()
         stat.close()
         
         self.corecount = -1
         
         for line in statlines:
             if (line[0:2] == "cp"):
                 self.corecount+= 1
             else:
                 break
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
         self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
         
         self.prev_idle.append(0)
         self.prev_total.append(0)
         self.idle.append(0)
         self.total.append(0)
     
     #---gpu_boxes---
     for i in range(self.deviceCount):
         product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
         product_name = product_name.decode('utf-8')
         
         gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
         
         label = Gtk.Label(product_name)
         
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
         
         gpu_box.pack_start(label, True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0)
         
         self.gpu_boxes.append(gpu_box)
     
     #---proc---
     proc_liststore = Gtk.ListStore(int, str, int)
     
     self.tree = Gtk.TreeView(model=proc_liststore)
     
     renderer_pid = Gtk.CellRendererText()
     column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0)
     column_pid.set_resizable(True)
     self.tree.append_column(column_pid)
     
     renderer_path = Gtk.CellRendererText()
     column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
     column_path.set_resizable(True)
     column_path.set_fixed_width(250)
     self.tree.append_column(column_path)
     
     renderer_mem = Gtk.CellRendererText()
     column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
     column_mem.set_resizable(True)
     self.tree.append_column(column_mem)
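
init() above only builds the widgets; the refresh logic is not part of this excerpt. A hedged sketch of what a periodic update of the six per-GPU progress bars might look like (the method name and the GLib.timeout_add_seconds scheduling mentioned in the comment are assumptions, not taken from the original):

 def update_gpu_bars(self):
     # Refresh the per-GPU bars from NVML; intended to be scheduled periodically,
     # e.g. with GLib.timeout_add_seconds(1, self.update_gpu_bars).
     for i in range(self.deviceCount):
         handle = self.gpu_handles[i]
         util = pynvml.nvmlDeviceGetUtilizationRates(handle)
         mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
         temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
         enc = pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
         dec = pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100.0)
         self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100.0)
         self.gpu_prog_bars[i*6 + 2].set_fraction(mem.used / mem.total)
         self.gpu_prog_bars[i*6 + 3].set_fraction(min(temp / 100.0, 1.0))
         self.gpu_prog_bars[i*6 + 4].set_fraction(enc / 100.0)
         self.gpu_prog_bars[i*6 + 5].set_fraction(dec / 100.0)
     return True  # keep a GLib timeout alive if scheduled that way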
Example #35
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder',
                           long(util_encoder[0]),
                           tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder',
                           long(util_decoder[0]),
                           tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #36
0
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            if not wrap_nvml_init(nvmlInit):
                return devices
            deviceCount = nvmlDeviceGetCount()
            log("identify_cards() will probe %i cards", deviceCount)
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                log("identify_cards() handle(%i)=%s", i, handle)
                props = {}

                def meminfo(memory):
                    return {
                        "total": int(memory.total),
                        "free": int(memory.free),
                        "used": int(memory.used),
                    }

                def pciinfo(pci):
                    i = {}
                    for nvname, pubname in {
                            "domain": "domain",
                            "bus": "bus",
                            "device": "device",
                            "pciDeviceId": "pci-device-id",
                            "pciSubSystemId": "pci-subsystem-id",
                    }.items():
                        try:
                            i[pubname] = int(getattr(pci, nvname))
                        except (ValueError, AttributeError):
                            pass
                    try:
                        i["bus-id"] = bytestostr(pci.busId)
                    except AttributeError:
                        pass
                    return i

                for prefix, prop, fn_name, args, conv in (
                    ("", "name", "nvmlDeviceGetName", (), strtobytes),
                    ("", "serial", "nvmlDeviceGetSerial", (), strtobytes),
                    ("", "uuid", "nvmlDeviceGetUUID", (), strtobytes),
                    ("", "pci", "nvmlDeviceGetPciInfo", (), pciinfo),
                    ("", "memory", "nvmlDeviceGetMemoryInfo", (), meminfo),
                    ("pcie-link", "generation-max",
                     "nvmlDeviceGetMaxPcieLinkGeneration", (), int),
                    ("pcie-link", "width-max", "nvmlDeviceGetMaxPcieLinkWidth",
                     (), int),
                    ("pcie-link", "generation",
                     "nvmlDeviceGetCurrPcieLinkGeneration", (), int),
                    ("pcie-link", "width", "nvmlDeviceGetCurrPcieLinkWidth",
                     (), int),
                    ("clock-info", "graphics", "nvmlDeviceGetClockInfo", (0, ),
                     int),
                    ("clock-info", "sm", "nvmlDeviceGetClockInfo", (1, ), int),
                    ("clock-info", "mem", "nvmlDeviceGetClockInfo", (2, ),
                     int),
                    ("clock-info", "graphics-max", "nvmlDeviceGetMaxClockInfo",
                     (0, ), int),
                    ("clock-info", "sm-max", "nvmlDeviceGetMaxClockInfo",
                     (1, ), int),
                    ("clock-info", "mem-max", "nvmlDeviceGetMaxClockInfo",
                     (2, ), int),
                    ("", "fan-speed", "nvmlDeviceGetFanSpeed", (), int),
                    ("", "temperature", "nvmlDeviceGetTemperature", (0, ),
                     int),
                    ("", "power-state", "nvmlDeviceGetPowerState", (), int),
                    ("", "vbios-version", "nvmlDeviceGetVbiosVersion", (),
                     strtobytes),
                ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        if prefix:
                            d = props.setdefault(prefix, {})
                        else:
                            d = props
                        d[prop] = v
                    except Exception as e:
                        log(
                            "identify_cards() cannot query %s using %s on device %i with handle %s: %s",
                            prop, fn, i, handle, e)
                        continue
                log("identify_cards() [%i]=%s", i, props)
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards using NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
Example #37
0
 def get_handles(self):
     """ Return all listed Nvidia handles """
     self.handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
                     for i in range(self.device_count)]
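Note: this variant assumes self.device_count was populated elsewhere in the class. A minimal standalone sketch of the same idea, assuming only that pynvml is installed and an NVIDIA driver is present, could look like:

import pynvml

pynvml.nvmlInit()                                    # NVML must be initialised before any device query
device_count = pynvml.nvmlDeviceGetCount()           # number of NVIDIA devices visible to NVML
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(device_count)]
pynvml.nvmlShutdown()                                # release NVML once the handles are no longer needed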
Example #38
0
 def __init__(self, device_idx):
     super().__init__()
     self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
Example #39
0
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys: count, driver_version and devices:
            count: Number of gpus found
            driver_version: The version of the system’s graphics driver
            devices: A list in which every item is a namedtuple Device with 10 fields,
                     for example id, name and fan_speed. Note that each entry of the
                     process field is itself a namedtuple Process with 11 fields.
    """

    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts mW
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        # process_info = [(item.pid, item.usedGpuMemory) for item in process_info]
        process_info = []
        for p in processes:
            # append Process object to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported as e:
            fan_speed = None
        power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))

    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
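A hedged usage sketch for get_infos() above: it assumes NVML has already been initialised and that psutil is importable; the printing below is illustrative only.

import pynvml

pynvml.nvmlInit()                         # get_infos() expects an initialised NVML session
try:
    infos = get_infos()
    print("driver:", infos["driver_version"], "- gpus:", infos["count"])
    for dev in infos["devices"]:
        used_mib = dev.used / 1024 ** 2   # memory fields are reported in bytes
        print(f"GPU {dev.id} ({dev.name}): {used_mib:.0f} MiB used, {dev.temperature} C")
finally:
    pynvml.nvmlShutdown()                 # always release NVML, even if a query fails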
Example #40
0
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}

                def meminfo(memory):
                    return {
                        "total": int(memory.total),
                        "free": int(memory.free),
                        "used": int(memory.used),
                    }

                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId",
                              "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i

                for prop, fn_name, args, conv in (
                    ("name", "nvmlDeviceGetName", (), str),
                    ("serial", "nvmlDeviceGetSerial", (), str),
                    ("uuid", "nvmlDeviceGetUUID", (), str),
                    ("pci", "nvmlDeviceGetPciInfo", (), pciinfo),
                    ("memory", "nvmlDeviceGetMemoryInfo", (), meminfo),
                    ("pcie-link-generation-max",
                     "nvmlDeviceGetMaxPcieLinkGeneration", (), int),
                    ("pcie-link-width-max", "nvmlDeviceGetMaxPcieLinkWidth",
                     (), int),
                    ("pcie-link-generation",
                     "nvmlDeviceGetCurrPcieLinkGeneration", (), int),
                    ("pcie-link-width", "nvmlDeviceGetCurrPcieLinkWidth", (),
                     int),
                    ("clock-info-graphics", "nvmlDeviceGetClockInfo", (0, ),
                     int),
                    ("clock-info-sm", "nvmlDeviceGetClockInfo", (1, ), int),
                    ("clock-info-mem", "nvmlDeviceGetClockInfo", (2, ), int),
                    ("clock-info-graphics-max", "nvmlDeviceGetMaxClockInfo",
                     (0, ), int),
                    ("clock-info-sm-max", "nvmlDeviceGetMaxClockInfo", (1, ),
                     int),
                    ("clock-info-mem-max", "nvmlDeviceGetMaxClockInfo", (2, ),
                     int),
                    ("fan-speed", "nvmlDeviceGetFanSpeed", (), int),
                    ("temperature", "nvmlDeviceGetTemperature", (0, ), int),
                    ("power-state", "nvmlDeviceGetPowerState", (), int),
                    ("vbios-version", "nvmlDeviceGetVbiosVersion", (), str),
                ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log(
                            "identify_cards() cannot query %s using %s on device %i with handle %s: %s",
                            prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
Example #41
0
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						continue
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			index = 2
			self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
			self.wfile.write(bytes(host, 'utf-8'))
			self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				if not parameters.get('host' + str(index), [''])[0].strip():
					pass
				elif not parameters.get('start' + str(index), [''])[0].strip():
					pass
				elif not parameters.get('end' + str(index), [''])[0].strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Example #42
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import os
from datetime import datetime
import torch
import numpy as np
import config as cfg
import util.PointNetVlad as PNV
import pynvml
from dateutil import tz

pynvml.nvmlInit()
handle0 = pynvml.nvmlDeviceGetHandleByIndex(0)
if torch.cuda.device_count() > 1:
    handle1 = pynvml.nvmlDeviceGetHandleByIndex(1)
ratio = 1024 ** 2

def print_gpu(s=""):
    if torch.cuda.device_count() > 1:
        meminfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
        meminfo1 = pynvml.nvmlDeviceGetMemoryInfo(handle1)
        used = (meminfo0.used + meminfo1.used) / ratio
    else:
        meminfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
        used = meminfo0.used / ratio
    print(s+" used: ", used)

parser = argparse.ArgumentParser()
parser.add_argument('--results_dir', default='results/',
                    help='results dir [default: results]')
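The snippet above caches NVML handles at import time and reports total used device memory in MiB. A hedged sketch of calling print_gpu() around an allocation follows; the torch tensor below is an assumption used only to make the memory change visible.

import torch

print_gpu("before allocation")                       # baseline device memory in MiB
if torch.cuda.is_available():
    x = torch.zeros((1024, 1024), device="cuda")     # roughly 4 MiB of float32 data
    print_gpu("after allocation")                    # 'used' should grow once the CUDA context and tensor exist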
Example #43
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode('utf-8')    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not TTC this will be None.
                usedmem = (nv_process.usedGpuMemory // MB if
                           nv_process.usedGpuMemory else None)
                process['gpu_memory_usage'] = usedmem
                # process['gpu_memory_usage'] = ("%d MiB" % usedmem if usedmem is not None else usedmem)
                process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = "%d MiB" % (
                #     round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['cpu_memory_usage'] = (
                    round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute mode;
                # however, we only display each process once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        process['cpu_percent'] = 0.0
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        process['cpu_percent'] = 0.0
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else 0,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else 0,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else 0,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else 0,
                'memory.total': memory.total // MB if memory else 0,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
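new_query() here is a staticmethod of gpustat's GPUStatCollection; a hedged usage sketch, assuming the gpustat package is installed (attribute names follow its public API and may differ between versions):

from gpustat import GPUStatCollection

stats = GPUStatCollection.new_query()          # one NVML query over every visible GPU
for gpu in stats:                              # the collection iterates over GPUStat entries
    print(gpu.index, gpu.name, gpu.memory_used, "/", gpu.memory_total, "MiB")
print("driver:", stats.driver_version)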
Example #44
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode('utf-8')  # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not TTC this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None  # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Example #45
0
def run(args):
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    args.model_save_path = os.path.join(args.model_save_dir,\
                                        f'{args.modelName}-{args.datasetName}-{args.train_mode}.pth')
    # indicate used gpu
    if len(args.gpu_ids) == 0 and torch.cuda.is_available():
        # load free-most gpu
        pynvml.nvmlInit()
        dst_gpu_id, min_mem_used = 0, 1e16
        for g_id in [0, 1, 2, 3]:
            handle = pynvml.nvmlDeviceGetHandleByIndex(g_id)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            mem_used = meminfo.used
            if mem_used < min_mem_used:
                min_mem_used = mem_used
                dst_gpu_id = g_id
        print(f'Found gpu: {dst_gpu_id}, used memory: {min_mem_used}!')
        logger.info(f'Found gpu: {dst_gpu_id}, with {min_mem_used} bytes of memory in use!')
        args.gpu_ids.append(dst_gpu_id)
    # device
    using_cuda = len(args.gpu_ids) > 0 and torch.cuda.is_available()
    logger.info("Let's use %d GPUs!" % len(args.gpu_ids))
    device = torch.device('cuda:%d' % int(args.gpu_ids[0]) if using_cuda else 'cpu')
    args.device = device
    # add tmp tensor to increase the temporary consumption of GPU
    tmp_tensor = torch.zeros((100, 100)).to(args.device)
    # load data and models
    dataloader = MMDataLoader(args)
    model = AMIO(args).to(device)

    del tmp_tensor

    def count_parameters(model):
        answer = 0
        for p in model.parameters():
            if p.requires_grad:
                answer += p.numel()
                # print(p)
        return answer
    logger.info(f'The model has {count_parameters(model)} trainable parameters')
    # exit()
    # using multiple gpus
    # if using_cuda and len(args.gpu_ids) > 1:
    #     model = torch.nn.DataParallel(model,
    #                                   device_ids=args.gpu_ids,
    #                                   output_device=args.gpu_ids[0])
    atio = ATIO().getTrain(args)
    # do train
    atio.do_train(model, dataloader)
    # load pretrained model
    assert os.path.exists(args.model_save_path)
    model.load_state_dict(torch.load(args.model_save_path))
    model.to(device)
    # do test
    if args.is_tune:
        # using valid dataset to tune hyper parameters
        results = atio.do_test(model, dataloader['valid'], mode="VALID")
    else:
        results = atio.do_test(model, dataloader['test'], mode="TEST")

    del model
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(5)
 
    return results
Example #46
0
def get_device_handles():
    """Get a list of NVML device handles, one per device.

    Can throw NVMLError.
    """
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(pynvml.nvmlDeviceGetCount())]
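get_device_handles() assumes NVML is already initialised; a minimal hedged wrapper might look like the following (the MiB conversion and the memory/name queries are assumptions about how the handles get used):

import pynvml

pynvml.nvmlInit()
try:
    for handle in get_device_handles():
        name = pynvml.nvmlDeviceGetName(handle)       # bytes on older pynvml, str on newer
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(name, mem.free // (1024 ** 2), "MiB free")
finally:
    pynvml.nvmlShutdown()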
Example #47
0
    def stats(self):
        stats = {}
        for i in range(0, self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                in_use_by_us = gpu_in_use_by_this_process(handle)

                stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.{}.{}".format(
                    i, "memoryAllocated")] = (memory.used / float(memory.total)) * 100
                stats["gpu.{}.{}".format(i, "temp")] = temp

                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                    stats["gpu.process.{}.{}".format(i, "memory")] = util.memory
                    stats["gpu.process.{}.{}".format(
                        i, "memoryAllocated")] = (memory.used / float(memory.total)) * 100
                    stats["gpu.process.{}.{}".format(i, "temp")] = temp

                # Some GPUs don't provide information about power usage
                try:
                    power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                    power_capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0
                    power_usage = (power_watts / power_capacity_watts) * 100

                    stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                    if in_use_by_us:
                        stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts
                        stats["gpu.process.{}.{}".format(i, "powerPercent")] = power_usage

                except pynvml.NVMLError as err:
                    pass

            except pynvml.NVMLError as err:
                pass
        if psutil:
            net = psutil.net_io_counters()
            sysmem = psutil.virtual_memory()
            stats["cpu"] = psutil.cpu_percent()
            stats["memory"] = sysmem.percent
            stats["network"] = {
                "sent": net.bytes_sent - self.network_init["sent"],
                "recv": net.bytes_recv - self.network_init["recv"]
            }
            # TODO: maybe show other partitions, will likely need user to configure
            stats["disk"] = psutil.disk_usage('/').percent
            stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
            try:
                stats["proc.memory.rssMB"] = self.proc.memory_info().rss / \
                    1048576.0
                stats["proc.memory.percent"] = self.proc.memory_percent()
                stats["proc.cpu.threads"] = self.proc.num_threads()
            except psutil.NoSuchProcess:
                pass
        return stats
Example #48
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #49
-1
    def _summary(self):
        summary = []
        summary.append("GPU running Processes:")
        initGPU()
        try:
            gpusToUse = [int(n) for n in (self.gpusToUse.get()).split()]
            for i in gpusToUse:
                handle = nvmlDeviceGetHandleByIndex(i)
                cps = nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    # p_tags['pid'] = ps.pid
                    msg = " %d) " % i + psutil.Process(ps.pid).name()
                    msg += " (mem =%.2f MB)" % (float(ps.usedGpuMemory) /
                                                1048576.)
                    summary.append(msg)
        except NVMLError as err:
            summary.append(str(err))

        return summary
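initGPU() in this last example is a project-specific helper rather than part of pynvml; a hypothetical sketch of what such a helper might do, assuming it simply guards nvmlInit() so it runs once per process:

import pynvml

_NVML_READY = False

def initGPU():
    """Hypothetical stand-in: initialise NVML once per process."""
    global _NVML_READY
    if not _NVML_READY:
        pynvml.nvmlInit()
        _NVML_READY = True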