def test_record_utilization(self):
    """
    Records GPU utilization for a fixed window and validates every
    record that the monitor returns.

    The monitor is destroyed in a ``finally`` clause so that a failing
    assertion does not leave it alive for the tests that follow.
    """

    # One measurement every 0.01 seconds
    frequency = 0.01
    monitoring_time = 10
    metrics = [GPUUtilization]
    gpus = ['all']
    dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)
    try:
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Assert instance types and values
        for record in records:
            self.assertIsInstance(record.device(), GPUDevice)
            self.assertIsInstance(record.value(), float)
            self.assertLessEqual(record.value(), 100)
            self.assertEqual(record.value(), TEST_RECORD_VALUE)
            self.assertIsInstance(record.timestamp(), int)

        # The number of records should be divisible by number of metrics
        self.assertEqual(len(records) % len(metrics), 0)
        self.assertGreater(len(records), 0)

        # First and last record must be at least monitoring_time apart
        self.assertGreaterEqual(
            records[-1].timestamp() - records[0].timestamp(),
            monitoring_time)
    finally:
        dcgm_monitor.destroy()
def test_immediate_start_stop(self):
    """
    Starts recording and stops it right away, with no wait in between.

    The test passes when neither call raises. The monitor is destroyed
    in a ``finally`` clause so it is released even if one of them does.
    """

    frequency = 1
    metrics = [GPUUsedMemory, GPUFreeMemory]
    dcgm_monitor = DCGMMonitor(self._gpus, frequency, metrics)
    try:
        dcgm_monitor.start_recording_metrics()
        dcgm_monitor.stop_recording_metrics()
    finally:
        dcgm_monitor.destroy()
def test_record_memory(self):
    """
    Records used/free GPU memory for a fixed window and validates every
    record, then checks two error paths: stopping a monitor a second
    time, and constructing a monitor with an unknown metric tag.

    The monitor is destroyed in a ``finally`` clause so that a failing
    assertion does not leave it alive for the tests that follow.
    """

    # One measurement every 0.01 seconds
    frequency = 0.01
    monitoring_time = 10
    metrics = [GPUUsedMemory, GPUFreeMemory]
    gpus = ['all']
    dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)
    try:
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Assert instance types and values
        for record in records:
            self.assertIsInstance(record.device(), GPUDevice)
            self.assertIsInstance(record.value(), float)
            self.assertEqual(record.value(), TEST_RECORD_VALUE)
            self.assertIsInstance(record.timestamp(), int)

        # The number of records should be divisible by number of metrics
        self.assertEqual(len(records) % len(metrics), 0)
        self.assertGreater(len(records), 0)

        # First and last record must be at least monitoring_time apart
        self.assertGreaterEqual(
            records[-1].timestamp() - records[0].timestamp(),
            monitoring_time)

        # Stopping a second time, after recording already stopped, must raise
        with self.assertRaises(TritonModelAnalyzerException):
            dcgm_monitor.stop_recording_metrics()
    finally:
        dcgm_monitor.destroy()

    # A metric that the monitor does not know must be rejected on creation
    metrics = ['UndefinedTag']
    with self.assertRaises(TritonModelAnalyzerException):
        DCGMMonitor(gpus, frequency, metrics)
def test_immediate_start_stop(self):
    """
    Starts recording and stops it right away, with no wait in between.

    The test passes when neither call raises. The monitor is destroyed
    in a ``finally`` clause so it is released even if one of them does.
    """

    frequency = 0.01
    tags = [GPUUsedMemory, GPUFreeMemory]
    gpus = ['all']
    dcgm_monitor = DCGMMonitor(gpus, frequency, tags)
    try:
        dcgm_monitor.start_recording_metrics()
        dcgm_monitor.stop_recording_metrics()
    finally:
        dcgm_monitor.destroy()
def test_gpu_id(self):
    """
    Checks how the monitor constructor reacts to GPU identifiers: an
    undefined id must raise, while the id decoded from TEST_UUID must
    be accepted.
    """

    sampling_interval = 0.01
    memory_metrics = [GPUUsedMemory, GPUFreeMemory]

    # Construction with an id that does not exist has to fail
    with self.assertRaises(TritonModelAnalyzerException):
        DCGMMonitor(['UndefinedId'], sampling_interval, memory_metrics)

    # Construction with the id taken from TEST_UUID has to succeed
    uuid_text = str(TEST_UUID, encoding='ascii')
    monitor = DCGMMonitor([uuid_text], sampling_interval, memory_metrics)
    monitor.destroy()
def test_record_power(self):
    """
    Records GPU power usage for a fixed window and validates every
    record that the monitor returns.

    The monitor is destroyed in a ``finally`` clause so that a failing
    assertion does not leave it alive for the tests that follow.
    """

    # One measurement every second
    frequency = 1
    monitoring_time = 2
    metrics = [GPUPowerUsage]
    dcgm_monitor = DCGMMonitor(self._gpus, frequency, metrics)
    try:
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Assert instance types and values
        for record in records:
            self.assertIsInstance(record.device_uuid(), str)
            self.assertIsInstance(record.value(), float)
            self.assertEqual(record.value(), TEST_RECORD_VALUE)
            self.assertIsInstance(record.timestamp(), int)

        # The number of records should be divisible by number of metrics
        self.assertEqual(len(records) % len(metrics), 0)
        self.assertGreater(len(records), 0)

        # First and last record must be at least monitoring_time apart
        self.assertGreaterEqual(
            records[-1].timestamp() - records[0].timestamp(),
            monitoring_time)
    finally:
        dcgm_monitor.destroy()
class MetricsManager:
    """
    This class handles the profiling
    categorization of metrics
    """

    def __init__(self, config, metric_tags, server, result_manager):
        """
        Parameters
        ----------
        config : AnalyzerConfig
            The model analyzer's config
        metric_tags : List of str
            The list of tags corresponding to the metric types to monitor.
        server : TritonServer
            Handle to the instance of Triton being used
        result_manager : ResultManager
            instance that manages the result tables and
            adding results
        """

        self._server = server
        self._gpus = config.gpus
        self._monitoring_interval = config.monitoring_interval
        self._perf_analyzer_path = config.perf_analyzer_path
        self._config = config
        self._result_manager = result_manager

        self._dcgm_metrics = []
        self._perf_metrics = []
        self._cpu_metrics = []

        # Monitors only exist between _start_monitors and _destroy_monitors
        self._dcgm_monitor = None
        self._cpu_monitor = None

        self._create_metric_tables(metrics=MetricsManager.get_metric_types(
            tags=metric_tags))

    def _create_metric_tables(self, metrics):
        """
        Splits up monitoring metrics into various
        categories, defined in __init__ and
        requests result manager to make
        corresponding table

        Parameters
        ----------
        metrics : list of record types
            The record types to sort into the DCGM, perf_analyzer and
            CPU lists. A type that belongs to none of them is skipped.
        """

        # Separates metrics and objectives into related lists
        for metric in metrics:
            if metric in DCGMMonitor.model_analyzer_to_dcgm_field:
                self._dcgm_metrics.append(metric)
            elif metric in PerfAnalyzer.perf_metrics:
                self._perf_metrics.append(metric)
            elif metric in CPUMonitor.cpu_metrics:
                self._cpu_metrics.append(metric)

        self._result_manager.create_tables(
            gpu_specific_metrics=self._dcgm_metrics,
            non_gpu_specific_metrics=self._perf_metrics + self._cpu_metrics)

    def profile_server(self):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer

        Raises
        ------
        TritonModelAnalyzerException
        """

        try:
            self._start_monitors()
            server_gpu_metrics = self._get_gpu_inference_metrics()
        finally:
            # Release the monitors even when starting or stopping failed
            self._destroy_monitors()

        self._result_manager.add_server_data(data=server_gpu_metrics)

    def profile_model(self, perf_config, perf_output_writer=None):
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing.

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        (dict of lists, list)
            The gpu specific and non gpu metrics, or (None, None) when
            the perf_analyzer run reported a failed status
        """

        try:
            # Start monitors and run perf_analyzer
            self._start_monitors()
            perf_analyzer_metrics_or_status = self._get_perf_analyzer_metrics(
                perf_config, perf_output_writer)

            # Failed Status
            if perf_analyzer_metrics_or_status == 1:
                self._stop_monitors()
                return None, None
            perf_analyzer_metrics = perf_analyzer_metrics_or_status

            # Both monitors must be stopped before either one is destroyed
            model_gpu_metrics = self._get_gpu_inference_metrics()
            model_cpu_metrics = self._get_cpu_inference_metrics()
        finally:
            # Runs on success, on failed status and on any exception
            self._destroy_monitors()

        # Combine the metrics that do not have a GPU ID
        model_non_gpu_metrics = list(perf_analyzer_metrics.values()) + list(
            model_cpu_metrics.values())

        return model_gpu_metrics, model_non_gpu_metrics

    def _start_monitors(self):
        """
        Start any metrics monitors
        """

        self._dcgm_monitor = DCGMMonitor(self._gpus, self._monitoring_interval,
                                         self._dcgm_metrics)
        self._cpu_monitor = CPUMonitor(self._server, self._monitoring_interval,
                                       self._cpu_metrics)

        self._dcgm_monitor.start_recording_metrics()
        self._cpu_monitor.start_recording_metrics()

    def _stop_monitors(self):
        """
        Stop any metrics monitors
        """

        self._dcgm_monitor.stop_recording_metrics()
        self._cpu_monitor.stop_recording_metrics()

    def _destroy_monitors(self):
        """
        Destroy the monitors created by start

        Calling this again, or before any monitor was started, does
        nothing, so each monitor is destroyed at most once.
        """

        dcgm_monitor = getattr(self, '_dcgm_monitor', None)
        cpu_monitor = getattr(self, '_cpu_monitor', None)

        # Drop the handles first so that a failing destroy() is not retried
        self._dcgm_monitor = None
        self._cpu_monitor = None

        try:
            if dcgm_monitor is not None:
                dcgm_monitor.destroy()
        finally:
            # The CPU monitor is released even if the DCGM one failed
            if cpu_monitor is not None:
                cpu_monitor.destroy()

    def _get_perf_analyzer_metrics(self, perf_config, perf_output_writer=None):
        """
        Gets the aggregated metrics from the perf_analyzer

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        The aggregated perf_analyzer records, or 1 when the
        perf_analyzer run returned the status 1

        Raises
        ------
        TritonModelAnalyzerException
            If the perf_analyzer binary is not found
        """

        try:
            perf_analyzer = PerfAnalyzer(
                path=self._perf_analyzer_path,
                config=perf_config,
                timeout=self._config.perf_analyzer_timeout,
                max_cpu_util=self._config.perf_analyzer_cpu_util)
            status = perf_analyzer.run(self._perf_metrics)
        except FileNotFoundError as e:
            # Keep the original error attached as the cause
            raise TritonModelAnalyzerException(
                f"perf_analyzer binary not found : {e}") from e

        # PerfAnalyzer run was not successful
        if status == 1:
            return 1

        if perf_output_writer:
            perf_output_writer.write(perf_analyzer.output() + '\n')

        perf_records = perf_analyzer.get_records()
        perf_record_aggregator = RecordAggregator()
        perf_record_aggregator.insert_all(perf_records)

        return perf_record_aggregator.aggregate()

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific

        The monitors are not destroyed here; the caller does that once
        every monitor has been stopped.

        Returns
        -------
        dict
            keys are gpu ids and values are metric values
            in the order specified in self._dcgm_metrics
        """

        # Stop DCGM monitor
        dcgm_records = self._dcgm_monitor.stop_recording_metrics()

        # Insert all records into aggregator and get aggregated DCGM records
        dcgm_record_aggregator = RecordAggregator()
        dcgm_record_aggregator.insert_all(dcgm_records)

        records_groupby_gpu = dcgm_record_aggregator.groupby(
            self._dcgm_metrics, lambda record: record.device().device_id())

        # Regroup from one mapping per metric to one list per gpu id
        gpu_metrics = defaultdict(list)
        for metric in records_groupby_gpu.values():
            for gpu_id, metric_value in metric.items():
                gpu_metrics[gpu_id].append(metric_value)

        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated
        like the CPU metrics

        The monitors are not destroyed here; the caller does that once
        every monitor has been stopped.

        Returns
        -------
        The aggregated CPU records
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.

        Returns
        -------
        List
            of record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]
class MetricsManager:
    """
    This class handles the profiling
    categorization of metrics
    """

    def __init__(self, config, metric_tags, server, result_manager):
        """
        Parameters
        ----------
        config : AnalyzerConfig
            The model analyzer's config
        metric_tags : List of str
            The list of tags corresponding to the metric types to monitor.
        server : TritonServer
            Handle to the instance of Triton being used
        result_manager : ResultManager
            instance that manages the result tables and
            adding results
        """

        self._server = server
        self._gpus = config.gpus
        self._monitoring_interval = config.monitoring_interval
        self._perf_analyzer_path = config.perf_analyzer_path
        self._result_manager = result_manager

        self._dcgm_metrics = []
        self._perf_metrics = []
        self._cpu_metrics = []

        # Monitors only exist between _start_monitors and _destroy_monitors
        self._dcgm_monitor = None
        self._cpu_monitor = None

        self._create_metric_tables(metrics=MetricsManager.get_metric_types(
            tags=metric_tags))

    def _create_metric_tables(self, metrics):
        """
        Splits up monitoring metrics into various
        categories, defined in __init__ and
        requests result manager to make
        corresponding table

        Parameters
        ----------
        metrics : list of record types
            The record types to sort into the DCGM, perf_analyzer and
            CPU lists. A type that belongs to none of them is skipped.
        """

        # Separates metrics and objectives into related lists
        for metric in metrics:
            if metric in DCGMMonitor.model_analyzer_to_dcgm_field:
                self._dcgm_metrics.append(metric)
            elif metric in PerfAnalyzer.perf_metrics:
                self._perf_metrics.append(metric)
            elif metric in CPUMonitor.cpu_metrics:
                self._cpu_metrics.append(metric)

        self._result_manager.create_tables(
            gpu_specific_metrics=self._dcgm_metrics,
            non_gpu_specific_metrics=self._perf_metrics + self._cpu_metrics,
            aggregation_tag='Max')

    def configure_result_manager(self, config_model):
        """
        Processes the constraints and objectives
        for given ConfigModel and creates a result
        comparator to pass to the result manager

        Parameters
        ----------
        config_model : ConfigModel
            The config model object for the model that is currently being
            run
        """

        # Construct dict of record types for objectives and constraints
        objectives_by_tag = config_model.objectives()
        objective_tags = list(objectives_by_tag.keys())
        objective_metrics = MetricsManager.get_metric_types(
            tags=objective_tags)
        objectives = {
            metric: objectives_by_tag[tag]
            for tag, metric in zip(objective_tags, objective_metrics)
        }

        # Constraints may be empty
        constraints = {}
        constraints_by_tag = config_model.constraints()
        if constraints_by_tag:
            constraint_tags = list(constraints_by_tag.keys())
            constraint_metrics = MetricsManager.get_metric_types(
                tags=constraint_tags)
            constraints = {
                metric: constraints_by_tag[tag]
                for tag, metric in zip(constraint_tags, constraint_metrics)
            }

        self._result_comparator = ResultComparator(
            gpu_metric_types=self._dcgm_metrics,
            non_gpu_metric_types=self._perf_metrics + self._cpu_metrics,
            metric_objectives=objectives)

        self._result_manager.set_constraints_and_comparator(
            constraints=constraints, comparator=self._result_comparator)

    def profile_server(self, default_value):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer

        Parameters
        ----------
        default_value : str
            The value to fill in for columns in the table that don't apply
            to profiling server only

        Raises
        ------
        TritonModelAnalyzerException
        """

        try:
            self._start_monitors()
            server_gpu_metrics = self._get_gpu_inference_metrics()
        finally:
            # Release the monitors even when starting or stopping failed
            self._destroy_monitors()

        self._result_manager.add_server_data(data=server_gpu_metrics,
                                             default_value=default_value)

    def profile_model(self, perf_config, perf_output_writer=None):
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing.

        The measurement is built with the comparator created by
        configure_result_manager, so that method has to be called first.

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written
        """

        try:
            # Start monitors and run perf_analyzer
            self._start_monitors()
            perf_analyzer_metrics = self._get_perf_analyzer_metrics(
                perf_config, perf_output_writer)

            # Both monitors must be stopped before either one is destroyed
            model_gpu_metrics = self._get_gpu_inference_metrics()
            model_cpu_metrics = self._get_cpu_inference_metrics()
        finally:
            # Runs on success and on any exception
            self._destroy_monitors()

        # Combine the metrics that do not have a GPU ID
        model_non_gpu_metric_values = list(
            perf_analyzer_metrics.values()) + list(model_cpu_metrics.values())

        # Construct a measurement
        model_measurement = Measurement(
            gpu_data=model_gpu_metrics,
            non_gpu_data=model_non_gpu_metric_values,
            perf_config=perf_config,
            comparator=self._result_comparator)

        self._result_manager.add_model_data(measurement=model_measurement)

    def _start_monitors(self):
        """
        Start any metrics monitors
        """

        self._dcgm_monitor = DCGMMonitor(self._gpus, self._monitoring_interval,
                                         self._dcgm_metrics)
        self._cpu_monitor = CPUMonitor(self._server, self._monitoring_interval,
                                       self._cpu_metrics)

        self._dcgm_monitor.start_recording_metrics()
        self._cpu_monitor.start_recording_metrics()

    def _destroy_monitors(self):
        """
        Destroy the monitors created by start

        Calling this again, or before any monitor was started, does
        nothing, so each monitor is destroyed at most once.
        """

        dcgm_monitor = getattr(self, '_dcgm_monitor', None)
        cpu_monitor = getattr(self, '_cpu_monitor', None)

        # Drop the handles first so that a failing destroy() is not retried
        self._dcgm_monitor = None
        self._cpu_monitor = None

        try:
            if dcgm_monitor is not None:
                dcgm_monitor.destroy()
        finally:
            # The CPU monitor is released even if the DCGM one failed
            if cpu_monitor is not None:
                cpu_monitor.destroy()

    def _get_perf_analyzer_metrics(self, perf_config, perf_output_writer=None):
        """
        Gets the aggregated metrics from the perf_analyzer

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        The aggregated perf_analyzer records

        Raises
        ------
        TritonModelAnalyzerException
            If the perf_analyzer binary is not found
        """

        try:
            perf_analyzer = PerfAnalyzer(path=self._perf_analyzer_path,
                                         config=perf_config)
            perf_analyzer.run(self._perf_metrics)
        except FileNotFoundError as e:
            # Keep the original error attached as the cause
            raise TritonModelAnalyzerException(
                f"perf_analyzer binary not found : {e}") from e

        if perf_output_writer:
            perf_output_writer.write(perf_analyzer.output() + '\n')

        perf_records = perf_analyzer.get_records()
        perf_record_aggregator = RecordAggregator()
        perf_record_aggregator.insert_all(perf_records)

        return perf_record_aggregator.aggregate()

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific

        The monitors are not destroyed here; the caller does that once
        every monitor has been stopped.

        Returns
        -------
        dict
            keys are gpu ids and values are metric values
            in the order specified in self._dcgm_metrics
        """

        # Stop DCGM monitor
        dcgm_records = self._dcgm_monitor.stop_recording_metrics()

        # Insert all records into aggregator and get aggregated DCGM records
        dcgm_record_aggregator = RecordAggregator()
        dcgm_record_aggregator.insert_all(dcgm_records)

        records_groupby_gpu = dcgm_record_aggregator.groupby(
            self._dcgm_metrics, lambda record: record.device().device_id())

        # Regroup from one mapping per metric to one list per gpu id
        gpu_metrics = defaultdict(list)
        for metric in records_groupby_gpu.values():
            for gpu_id, metric_value in metric.items():
                gpu_metrics[gpu_id].append(metric_value)

        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated
        like the CPU metrics

        The monitors are not destroyed here; the caller does that once
        every monitor has been stopped.

        Returns
        -------
        The aggregated CPU records
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.

        Returns
        -------
        List
            of record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]