def test_collect_platform_information(*mocks):
    """Verify the platform, metrics and labels from collect_platform_information.

    The function under test is called with ``include_optional_labels=True`` and
    its three results are compared against fixed expected values.

    ``mocks`` is not used in the body; presumably it receives objects injected
    by patch decorators that are not visible here - confirm against the caller.
    """
    got_platform, got_metrics, got_labels = collect_platform_information(
        include_optional_labels=True
    )

    # Expected platform description, built piece by piece for readability.
    expected_rdt_information = RDTInformation(
        True, True, True, True, 'fffff', '2', 8, 10, 20)
    expected_measurements = {
        MetricName.PLATFORM_CPU_USAGE: {0: 100, 1: 200},
        MetricName.PLATFORM_MEM_USAGE_BYTES: 1337,
        MetricName.PLATFORM_MEM_NUMA_FREE_BYTES: {0: 1},
        MetricName.PLATFORM_MEM_NUMA_USED_BYTES: {0: 2},
        MetricName.PLATFORM_VMSTAT_NUMA_PAGES_MIGRATED: 5,
    }
    expected_platform = Platform(
        sockets=1,
        cores=1,
        cpus=2,
        numa_nodes=1,
        topology={},
        cpu_model='intel xeon',
        cpu_model_number=0x5E,
        cpu_codename=CPUCodeName.SKYLAKE,
        timestamp=1536071557.123456,
        node_cpus={},
        node_distances={},
        rdt_information=expected_rdt_information,
        measurements=expected_measurements,
        swap_enabled=False,
    )
    assert got_platform == expected_platform

    # Spot-check selected metrics.
    assert_metric(got_metrics, MetricName.PLATFORM_MEM_USAGE_BYTES,
                  expected_metric_value=1337)
    assert_metric(got_metrics, MetricName.PLATFORM_CPU_USAGE, {'cpu': '0'},
                  expected_metric_value=100)
    assert_metric(got_metrics, MetricName.PLATFORM_TOPOLOGY_CORES,
                  expected_metric_value=1)
    assert_metric(got_metrics, MetricName.PLATFORM_VMSTAT_NUMA_PAGES_MIGRATED,
                  expected_metric_value=5)

    expected_labels = {
        "sockets": "1",
        "cores": "1",
        "cpus": "2",
        "host": "test_host",
        "wca_version": "0.1",
        "cpu_model": "intel xeon",
    }
    assert got_labels == expected_labels
def _iterate(self):
    """Run a single iteration of the runner loop.

    Steps: fetch tasks from the node, label them, sync containers, collect
    platform information and per-task data, invoke the optional body callback,
    wait, and finally send all gathered metrics with the common labels.

    If tasks cannot be fetched or task measurements are missing, the error is
    logged, ``self._wait()`` is called and the iteration ends early without
    sending metrics.
    """
    started_at = time.time()

    # Fetch tasks; on failure skip the rest of this iteration.
    try:
        tasks = self._node.get_tasks()
    except TaskSynchronizationException as err:
        log.error('Cannot synchronize tasks with node (error=%s) - skip this iteration!', err)
        self._wait()
        return

    append_additional_labels_to_tasks(self._task_label_generators, tasks)
    log.debug('Tasks detected: %d', len(tasks))

    # Align internally managed containers with the tasks just found.
    containers = self._containers_manager.sync_containers_state(tasks)
    mapping_lines = [
        '%s(%s) = %s' % (task.name, task.task_id, container._cgroup_path)
        for task, container in containers.items()
    ]
    log.log(TRACE, 'Tasks container mapping:\n%s', '\n'.join(mapping_lines))

    # @TODO why not in platform module?
    uncore_measurements = self._uncore_get_measurements()

    # Platform state, its metrics and its labels.
    platform, platform_metrics, platform_labels = platforms.collect_platform_information(
        self._rdt_enabled,
        self._gather_hw_mm_topology,
        extra_platform_measurements=uncore_measurements,
        include_optional_labels=False,
    )

    # Labels attached to everything sent: platform labels overridden by extra labels.
    common_labels = dict(platform_labels, **self._extra_labels)

    # Per-task data; on missing measurements skip the rest of this iteration.
    try:
        tasks_data = _prepare_tasks_data(containers)
    except MissingMeasurementException as err:
        log.error('Cannot synchronize tasks measurements (error=%s) - skip this iteration!', err)
        self._wait()
        return

    # Hook for code injected by other runners.
    body_callback = self._iterate_body_callback
    if body_callback is not None:
        body_callback(containers, platform, tasks_data, common_labels)

    self._wait()

    # The registered duration is measured after the wait above.
    elapsed = time.time() - started_at
    profiling.profiler.register_duration('iteration', elapsed)

    # Assemble and send the generic metrics.
    package = MetricPackage(self._metrics_storage)
    package.add_metrics(_get_internal_metrics(tasks))
    package.add_metrics(platform_metrics)
    package.add_metrics(_build_tasks_metrics(tasks_data))
    package.add_metrics(profiling.profiler.get_metrics())
    package.add_metrics(get_logging_metrics())
    package.send(common_labels)
def _initialize_rdt(self) -> bool:
    """Validate the RDT requirements and reset resctrl to its initial values.

    Returns:
        ``True`` when initialization succeeded. ``False`` when a required
        control feature is not enabled on the platform, or when the initial
        L3/MB values are rejected with ``InvalidAllocations``.
    """
    platform, _, _ = platforms.collect_platform_information()
    rdt_info = platform.rdt_information
    cache_control = rdt_info.rdt_cache_control_enabled
    mb_control = rdt_info.rdt_mb_control_enabled

    # A required feature that the platform lacks stops initialization.
    if self._rdt_cache_control_required and not cache_control:
        log.error('RDT cache control enabled but is not supported by platform!')
        return False

    if self._rdt_mb_control_required and not mb_control:
        log.error('RDT memory bandwidth enabled but '
                  'allocation is not supported by platform!')
        return False

    # Start from the maximum values for L3 and MB...
    root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
        rdt_info.cbm_mask,
        platform.sockets,
        mb_control,
        cache_control,
    )

    # ...then let the allocation configuration override them, but only for
    # the features that are actually enabled.
    configured_l3 = self._allocation_configuration.default_rdt_l3
    if configured_l3 is not None and cache_control:
        root_rdt_l3 = configured_l3

    configured_mb = self._allocation_configuration.default_rdt_mb
    if configured_mb is not None and mb_control:
        root_rdt_mb = configured_mb

    try:
        if root_rdt_l3 is not None:
            validate_l3_string(root_rdt_l3,
                               platform.sockets,
                               rdt_info.cbm_mask,
                               rdt_info.min_cbm_bits)

        # MB is normalized only when present; otherwise None is passed through.
        mb_for_cleanup = root_rdt_mb
        if root_rdt_mb is not None:
            mb_for_cleanup = normalize_mb_string(
                root_rdt_mb,
                platform.sockets,
                rdt_info.mb_min_bandwidth,
                rdt_info.mb_bandwidth_gran)

        resctrl.cleanup_resctrl(
            root_rdt_l3, mb_for_cleanup, self._remove_all_resctrl_groups)
    except InvalidAllocations as err:
        log.error('Cannot initialize RDT subsystem: %s', err)
        return False

    return True
def _initialize(self) -> Optional[int]:
    """Check privileges, RDT availability and prepare internal state.

    Creates ``self._containers_manager`` and filters ``self._event_names`` for
    the detected CPU. When ``self._rdt_enabled`` is ``None`` it is set from the
    resctrl availability check (auto configuration).

    Returns:
        ``None`` on success, or the error code ``1`` that should stop the Runner
        (insufficient privileges, RDT explicitly enabled but unavailable,
        failed RDT initialization or missing RDT monitoring).
    """
    if not security.are_privileges_sufficient():
        # Every fragment ends with a space so the concatenated message
        # keeps its words separated.
        log.error(
            "Insufficient privileges! "
            "Impossible to use perf_event_open/resctrl subsystems. "
            "For unprivileged user it is needed to: "
            "adjust /proc/sys/kernel/perf_event_paranoid (set to -1), "
            "has CAP_DAC_OVERRIDE and CAP_SETUID capabilities and "
            "SECBIT_NO_SETUID_FIXUP secure bit set.")
        return 1

    # Initialization (auto discovery Intel RDT features).
    rdt_available = resctrl.check_resctrl()
    if self._rdt_enabled is None:
        self._rdt_enabled = rdt_available
        log.info('RDT enabled (auto configuration): %s', self._rdt_enabled)
    elif self._rdt_enabled is True and not rdt_available:
        log.error('RDT explicitly enabled but not available - exiting!')
        return 1

    if self._rdt_enabled:
        # Resctrl is enabled and available, call a placeholder to allow
        # further initialization.
        rdt_initialization_ok = self._initialize_rdt()
        if not rdt_initialization_ok:
            return 1

    # Postpone the container manager initialization after rdt checks were performed.
    # NOTE(review): the values returned here are not used below; the call is
    # kept unchanged in case it is relied on elsewhere - confirm and remove.
    platform_cpus, _, platform_sockets = platforms.collect_topology_information()

    platform, _, _ = platforms.collect_platform_information(self._rdt_enabled)
    rdt_information = platform.rdt_information

    self._event_names = _filter_out_event_names_for_cpu(
        self._event_names, platform.cpu_codename)

    # We currently do not support RDT without monitoring.
    if self._rdt_enabled and not rdt_information.is_monitoring_enabled():
        # The monitoring features are CMT and MBM (CAT is an allocation feature).
        log.error('RDT monitoring is required - please enable CMT '
                  'or MBM with kernel parameters!')
        return 1

    self._containers_manager = ContainerManager(
        platform=platform,
        allocation_configuration=self._allocation_configuration,
        event_names=self._event_names,
        enable_derived_metrics=self._enable_derived_metrics,
    )
    return None
def _iterate(self):
    """Run a single iteration of the runner loop.

    Steps: fetch tasks from the node, label them, sync containers, collect
    platform information and per-task data, run ``self._iterate_body``, wait,
    and finally send all gathered metrics with the common labels.

    If tasks cannot be fetched, the error is logged, ``self._wait()`` is called
    and the iteration ends early without sending metrics.
    """
    started_at = time.time()

    # Fetch tasks; on failure skip the rest of this iteration.
    try:
        tasks = self._node.get_tasks()
    except TaskSynchronizationException as err:
        log.error(
            'Cannot synchronize tasks with node (error=%s) - skip this iteration!',
            err)
        self._wait()
        return

    append_additional_labels_to_tasks(self._task_label_generators, tasks)
    log.debug('Tasks detected: %d', len(tasks))

    # Align internally managed containers with the tasks just found.
    containers = self._containers_manager.sync_containers_state(tasks)

    # Platform state, its metrics and its labels.
    platform_info = platforms.collect_platform_information(self._rdt_enabled)
    platform, platform_metrics, platform_labels = platform_info

    # Labels attached to everything sent: platform labels overridden by extra labels.
    common_labels = dict(platform_labels, **self._extra_labels)

    # Per-task data; task metrics are built before the iteration body runs.
    tasks_measurements, tasks_resources, tasks_labels = _prepare_tasks_data(containers)
    tasks_metrics = _build_tasks_metrics(tasks_labels, tasks_measurements)

    self._iterate_body(
        containers,
        platform,
        tasks_measurements,
        tasks_resources,
        tasks_labels,
        common_labels,
    )

    self._wait()

    # The registered duration is measured after the wait above.
    elapsed = time.time() - started_at
    profiling.profiler.register_duration('iteration', elapsed)

    # Assemble and send the generic metrics.
    package = MetricPackage(self._metrics_storage)
    package.add_metrics(_get_internal_metrics(tasks))
    package.add_metrics(platform_metrics)
    package.add_metrics(tasks_metrics)
    package.add_metrics(profiling.profiler.get_metrics())
    package.add_metrics(get_logging_metrics())
    package.send(common_labels)
def _iterate(self):
    """Run a single iteration of the runner loop.

    Steps: fetch tasks from the node, sanitize their label keys, sync
    containers, collect platform information and per-task data, run
    ``self._iterate_body``, wait, and finally send all gathered metrics with
    the common labels.
    """
    started_at = time.time()

    # Fetch tasks from the node.
    tasks = self._node.get_tasks()
    log.debug('Tasks detected: %d', len(tasks))

    # Replace each task's labels with a copy whose keys are sanitized;
    # label values are left untouched.
    for task in tasks:
        task.labels = {
            sanitize_label(key): value
            for key, value in task.labels.items()
        }

    # Align internally managed containers with the tasks just found.
    containers = self._containers_manager.sync_containers_state(tasks)

    # Platform state, its metrics and its labels.
    platform_info = platforms.collect_platform_information(self._rdt_enabled)
    platform, platform_metrics, platform_labels = platform_info

    # Labels attached to everything sent: platform labels overridden by extra labels.
    common_labels = dict(platform_labels, **self._extra_labels)

    # Per-task data; task metrics are built before the iteration body runs.
    tasks_measurements, tasks_resources, tasks_labels = _prepare_tasks_data(containers)
    tasks_metrics = _build_tasks_metrics(tasks_labels, tasks_measurements)

    self._iterate_body(
        containers,
        platform,
        tasks_measurements,
        tasks_resources,
        tasks_labels,
        common_labels,
    )

    self._wait()

    # The registered duration is measured after the wait above.
    elapsed = time.time() - started_at
    profiling.profiler.register_duration('iteration', elapsed)

    # Assemble and send the generic metrics.
    package = MetricPackage(self._metrics_storage)
    package.add_metrics(_get_internal_metrics(tasks))
    package.add_metrics(platform_metrics)
    package.add_metrics(tasks_metrics)
    package.add_metrics(profiling.profiler.get_metrics())
    package.add_metrics(get_logging_metrics())
    package.send(common_labels)
def _initialize(self) -> Optional[int]:
    """Check RDT availability, privileges and prepare internal state.

    Creates ``self._containers_manager``, filters ``self._event_names`` for the
    detected CPU and initializes the uncore PMU. When ``self._rdt_enabled`` is
    ``None`` it is set from the resctrl availability check (auto configuration).

    Returns:
        ``None`` on success, or the error code ``1`` that should stop the Runner
        (RDT explicitly enabled but unavailable, insufficient privileges,
        failed RDT initialization callback or missing RDT monitoring).
    """
    # Initialization (auto discovery Intel RDT features).
    rdt_available = resctrl.check_resctrl()
    if self._rdt_enabled is None:
        self._rdt_enabled = rdt_available
        log.info('RDT enabled (auto configuration): %s', self._rdt_enabled)
    elif self._rdt_enabled is True and not rdt_available:
        log.error('RDT explicitly enabled but not available - exiting!')
        return 1

    # _allocation_configuration is set in allocation mode (AllocationRunner)
    # so we need access to write in cgroups.
    write_to_cgroup = self._allocation_configuration is not None
    use_resctrl = self._rdt_enabled
    use_perf = len(self._event_names) > 0

    if not security.are_privileges_sufficient(write_to_cgroup, use_resctrl, use_perf):
        return 1

    if self._rdt_enabled:
        # Resctrl is enabled and available, call a placeholder to allow
        # further initialization.
        # For MeasurementRunner there is nothing to configure in RDT to measure
        # resource usage. The callback allows RDT-specific initialization when
        # MeasurementRunner functionality is used inside another runner.
        if self._initialize_rdt_callback is not None:
            rdt_initialization_ok = self._initialize_rdt_callback()
            if not rdt_initialization_ok:
                return 1

    log.debug('rdt_enabled: %s', self._rdt_enabled)
    log.debug('gather_hw_mm_topology: %s', self._gather_hw_mm_topology)
    platform, _, _ = platforms.collect_platform_information(
        self._rdt_enabled,
        gather_hw_mm_topology=self._gather_hw_mm_topology
    )
    rdt_information = platform.rdt_information

    self._event_names = _filter_out_event_names_for_cpu(
        self._event_names, platform.cpu_codename)

    # We currently do not support RDT without monitoring.
    if self._rdt_enabled and not rdt_information.is_monitoring_enabled():
        # The monitoring features are CMT and MBM (CAT is an allocation feature).
        log.error('RDT monitoring is required - please enable CMT '
                  'or MBM with kernel parameters!')
        return 1

    self._containers_manager = ContainerManager(
        platform=platform,
        allocation_configuration=self._allocation_configuration,
        event_names=self._event_names,
        enable_derived_metrics=self._enable_derived_metrics,
        wss_reset_interval=self._wss_reset_interval,
        perf_aggregate_cpus=self._perf_aggregate_cpus
    )

    self._init_uncore_pmu(self._enable_derived_metrics, self._enable_perf_uncore, platform)

    return None
def _initialize(self) -> Optional[int]:
    """Check RDT availability, privileges and prepare internal state.

    Can return error code that should stop Runner.

    Flow:
    - Conclude requirements based on configuration
    - Conclude required features based on auto discovery
    - confront user expectations from configuration file with
      resctrl fs and security access
    - check RDT HW monitoring features availability

    Returns:
        ``None`` on success, or the error code ``1`` when RDT was explicitly
        enabled but is unavailable or lacks monitoring, the perf event count
        limit check fails, privileges are insufficient, or the RDT
        initialization callback reports failure.
    """
    resctrl_available = resctrl.check_resctrl()

    # If enabled explicitly, check resctrl availability right now.
    if self._rdt_enabled is True and not resctrl_available:
        log.error('RDT explicitly enabled but resctrl fs not available - exiting!')
        return 1

    # Auto discovery of Intel RDT features.
    if self._rdt_enabled is None:
        # Follow resctrl availability for now - monitoring/access is checked later.
        log.debug('Enable RDT auto discovery (resctrl availability=%s)', resctrl_available)
        self._rdt_enabled = resctrl_available
        rdt_auto_enabling = True
    else:
        rdt_auto_enabling = False

    log.debug('gather_hw_mm_topology: %s', self._gather_hw_mm_topology)
    platform, _, _ = platforms.collect_platform_information(
        resctrl_available,
        gather_hw_mm_topology=self._gather_hw_mm_topology
    )

    # Confront RDT (resctrl fs) with HW enabled monitoring features.
    if self._rdt_enabled and not platform.rdt_information.is_monitoring_enabled():
        # Note: WCA does not support RDT without monitoring (keeps a mapping of
        # cgroups and resctrl groups).
        # The first fragment ends with a space so the two sentences do not
        # run together in the log output.
        msg = ('Resctrl is available but RDT monitoring features are not! ' +
               'Please enable CMT or MBM with kernel parameters (monitoring is ' +
               'required for CAT or MBA allocation)!')
        if rdt_auto_enabling:
            # Auto discovery: silently fall back to running without RDT.
            log.debug(msg)
            self._rdt_enabled = False
            # Clear the RDT information so it is not available later,
            # e.g. to ContainerManager.
            platform.rdt_information = None
        else:
            # RDT was requested explicitly - fail fast here.
            log.error(msg)
            return 1

    # RDT availability is decided at this point - report it.
    log.info('RDT: %s %s',
             'enabled' if self._rdt_enabled else 'disabled',
             ' (auto discovery)' if rdt_auto_enabling else '',
             )

    # Event names (perf cgroups)
    self._event_names = filter_out_event_names_for_cpu(
        self._event_names, platform.cpu_codename)
    log.info('Enabling %i perf events (for cgroups).', len(self._event_names))
    log.debug('Enabling perf events: %s', ', '.join(self._event_names))

    # Check the requested events against the perf event count limit
    # (only when there are any events to check).
    if self._event_names and not check_perf_event_count_limit(
            self._event_names, platform.cpus, platform.cores):
        return 1

    # _allocation_configuration is set in allocation mode (AllocationRunner)
    # so we need access to write in cgroups.
    write_to_cgroup = self._allocation_configuration is not None
    use_perf = len(self._event_names) > 0

    # Check we have enough access.
    if not security.are_privileges_sufficient(write_to_cgroup, self._rdt_enabled, use_perf):
        return 1

    # Resctrl is enabled and available, call a placeholder to allow further
    # initialization. In "measurement mode" there is nothing to configure in
    # RDT; the callback allows RDT-specific initialization when
    # "MeasurementRunner" is used as a component of other runners,
    # e.g. Allocation.
    if self._rdt_enabled:
        if self._initialize_rdt_callback is not None:
            rdt_initialization_ok = self._initialize_rdt_callback()
            if not rdt_initialization_ok:
                return 1

    self._containers_manager = ContainerManager(
        platform=platform,
        allocation_configuration=self._allocation_configuration,
        event_names=self._event_names,
        enable_derived_metrics=self._enable_derived_metrics,
        wss_reset_cycles=self._wss_reset_cycles,
        wss_stable_cycles=self._wss_stable_cycles,
        wss_membw_threshold=self._wss_membw_threshold,
        perf_aggregate_cpus=self._perf_aggregate_cpus,
        interval=self._interval,
        sched=self._sched,
    )
    log.log(TRACE, 'container manager config: %s', self._containers_manager.__dict__)

    self._init_uncore_pmu_events(self._enable_derived_metrics, self._uncore_events, platform)

    return None