def test_record_memory(self):
        """
        Record GPU memory metrics with a DCGMMonitor and validate the records.

        Also checks that a second call to stop_recording_metrics() raises
        TritonModelAnalyzerException, and that constructing a monitor with an
        undefined metric tag raises the same exception.
        """

        # One measurement every 0.01 seconds
        frequency = 0.01
        monitoring_time = 10
        metrics = [GPUUsedMemory, GPUFreeMemory]
        gpus = ['all']
        dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)

        # try/finally guarantees destroy() runs even when a call or an
        # assertion below fails, so a failing test does not leave the
        # monitor alive.
        try:
            dcgm_monitor.start_recording_metrics()
            time.sleep(monitoring_time)
            records = dcgm_monitor.stop_recording_metrics()

            # Assert instance types and the recorded value
            for record in records:
                self.assertIsInstance(record.device(), GPUDevice)
                self.assertIsInstance(record.value(), float)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)

            # First and last record must be at least monitoring_time apart
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)

            # Recording was already stopped above, so stopping again must fail
            with self.assertRaises(TritonModelAnalyzerException):
                dcgm_monitor.stop_recording_metrics()
        finally:
            dcgm_monitor.destroy()

        # An unknown metric tag must be rejected at construction time
        metrics = ['UndefinedTag']
        with self.assertRaises(TritonModelAnalyzerException):
            DCGMMonitor(gpus, frequency, metrics)
    def test_gpu_id(self):
        """
        Check how the DCGMMonitor constructor reacts to GPU identifiers:
        an undefined id is expected to raise TritonModelAnalyzerException,
        while TEST_UUID is expected to be accepted.
        """

        sampling_interval = 0.01
        memory_metrics = [GPUUsedMemory, GPUFreeMemory]

        # An unknown GPU id must be rejected at construction time
        unknown_gpus = ['UndefinedId']
        with self.assertRaises(TritonModelAnalyzerException):
            DCGMMonitor(unknown_gpus, sampling_interval, memory_metrics)

        # TEST_UUID is converted to str (ASCII) before being passed in;
        # construction followed by destroy() must complete without raising
        known_gpus = [str(TEST_UUID, encoding='ascii')]
        monitor = DCGMMonitor(known_gpus, sampling_interval, memory_metrics)
        monitor.destroy()
    def test_record_utilization(self):
        """
        Record GPU utilization with a DCGMMonitor and validate the records:
        types, an upper bound of 100, the expected value, and the time span
        covered by the recording.
        """

        # One measurement every 0.01 seconds
        frequency = 0.01
        monitoring_time = 10
        metrics = [GPUUtilization]
        gpus = ['all']
        dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)

        # try/finally guarantees destroy() runs even when a call or an
        # assertion below fails, so a failing test does not leave the
        # monitor alive.
        try:
            dcgm_monitor.start_recording_metrics()
            time.sleep(monitoring_time)
            records = dcgm_monitor.stop_recording_metrics()

            # Assert instance types and the recorded value
            for record in records:
                self.assertIsInstance(record.device(), GPUDevice)
                self.assertIsInstance(record.value(), float)
                self.assertLessEqual(record.value(), 100)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)

            # First and last record must be at least monitoring_time apart
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)
        finally:
            dcgm_monitor.destroy()
Beispiel #4
0
 def test_immediate_start_stop(self):
     """
     Start recording and stop it again right away, with no sleep in
     between; the test passes if none of the calls raises.
     """
     frequency = 1
     metrics = [GPUUsedMemory, GPUFreeMemory]
     dcgm_monitor = DCGMMonitor(self._gpus, frequency, metrics)

     # try/finally guarantees destroy() runs even if start/stop raises,
     # so a failing test does not leave the monitor alive.
     try:
         dcgm_monitor.start_recording_metrics()
         dcgm_monitor.stop_recording_metrics()
     finally:
         dcgm_monitor.destroy()
    def _start_monitors(self, cpu_only=False):
        """
        Create and start the metrics monitors.

        Parameters
        ----------
        cpu_only : bool
            When truthy, no GPU monitor is created and only the CPU
            monitor is started.

        Raises
        ------
        TritonModelAnalyzerException
            Re-raised after calling self._destroy_monitors() if setting up
            or starting the GPU monitor fails with this exception.
        """

        gpu_monitoring_enabled = not cpu_only
        if gpu_monitoring_enabled:
            try:
                cfg = self._config
                if not cfg.use_local_gpu_monitor:
                    # GPU metrics are read from the configured metrics URL
                    self._gpu_monitor = RemoteMonitor(
                        cfg.triton_metrics_url,
                        cfg.monitoring_interval,
                        self._gpu_metrics,
                    )
                else:
                    # Local DCGM monitor; the GPU check runs right after
                    # the monitor has been assigned
                    self._gpu_monitor = DCGMMonitor(
                        self._gpus,
                        cfg.monitoring_interval,
                        self._gpu_metrics,
                    )
                    self._check_triton_and_model_analyzer_gpus()
                self._gpu_monitor.start_recording_metrics()
            except TritonModelAnalyzerException:
                # Tear down the monitors before propagating the failure
                self._destroy_monitors()
                raise

        # The CPU monitor is started regardless of cpu_only
        cpu_monitor = CPUMonitor(
            self._server,
            self._config.monitoring_interval,
            self._cpu_metrics,
        )
        self._cpu_monitor = cpu_monitor
        cpu_monitor.start_recording_metrics()
Beispiel #6
0
 def test_immediate_start_stop(self):
     """
     Start recording on all GPUs and stop it again right away, with no
     sleep in between; the test passes if none of the calls raises.
     """
     frequency = 0.01
     tags = [GPUUsedMemory, GPUFreeMemory]
     gpus = ['all']
     dcgm_monitor = DCGMMonitor(gpus, frequency, tags)

     # try/finally guarantees destroy() runs even if start/stop raises,
     # so a failing test does not leave the monitor alive.
     try:
         dcgm_monitor.start_recording_metrics()
         dcgm_monitor.stop_recording_metrics()
     finally:
         dcgm_monitor.destroy()
    def _start_monitors(self):
        """
        Create the DCGM and CPU monitors, then start recording on both
        """

        self._dcgm_monitor = DCGMMonitor(
            self._gpus,
            self._monitoring_interval,
            self._dcgm_metrics,
        )
        self._cpu_monitor = CPUMonitor(
            self._server,
            self._monitoring_interval,
            self._cpu_metrics,
        )

        # Both monitors exist before either one starts recording;
        # the DCGM monitor is started first, then the CPU monitor
        for monitor in (self._dcgm_monitor, self._cpu_monitor):
            monitor.start_recording_metrics()
Beispiel #8
0
    def test_record_power(self):
        """
        Record GPU power usage with a DCGMMonitor on self._gpus and validate
        the records: types, the expected value, and the time span covered
        by the recording.
        """

        # One measurement every 1 second
        frequency = 1
        monitoring_time = 2
        metrics = [GPUPowerUsage]
        dcgm_monitor = DCGMMonitor(self._gpus, frequency, metrics)

        # try/finally guarantees destroy() runs even when a call or an
        # assertion below fails, so a failing test does not leave the
        # monitor alive.
        try:
            dcgm_monitor.start_recording_metrics()
            time.sleep(monitoring_time)
            records = dcgm_monitor.stop_recording_metrics()

            # Assert instance types and the recorded value
            for record in records:
                self.assertIsInstance(record.device_uuid(), str)
                self.assertIsInstance(record.value(), float)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)

            # First and last record must be at least monitoring_time apart
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)
        finally:
            dcgm_monitor.destroy()