def test_collect_platform_information(*mocks):
    """Verify the platform, metrics and labels produced by collect_platform_information.

    The positional ``mocks`` are accepted but not referenced in the body.
    """
    result = collect_platform_information(include_optional_labels=True)
    got_platform, got_metrics, got_labels = result

    # Expected platform description, built piece by piece for readability.
    expected_rdt = RDTInformation(True, True, True, True, 'fffff', '2', 8, 10, 20)
    expected_measurements = {
        MetricName.PLATFORM_CPU_USAGE: {0: 100, 1: 200},
        MetricName.PLATFORM_MEM_USAGE_BYTES: 1337,
        MetricName.PLATFORM_MEM_NUMA_FREE_BYTES: {0: 1},
        MetricName.PLATFORM_MEM_NUMA_USED_BYTES: {0: 2},
        MetricName.PLATFORM_VMSTAT_NUMA_PAGES_MIGRATED: 5,
    }
    expected_platform = Platform(
        sockets=1,
        cores=1,
        cpus=2,
        numa_nodes=1,
        topology={},
        cpu_model='intel xeon',
        cpu_model_number=0x5E,
        cpu_codename=CPUCodeName.SKYLAKE,
        timestamp=1536071557.123456,
        node_cpus={},
        node_distances={},
        rdt_information=expected_rdt,
        measurements=expected_measurements,
        swap_enabled=False,
    )
    assert got_platform == expected_platform

    # Spot-check selected metrics.
    assert_metric(
        got_metrics, MetricName.PLATFORM_MEM_USAGE_BYTES, expected_metric_value=1337)
    assert_metric(
        got_metrics, MetricName.PLATFORM_CPU_USAGE, {'cpu': '0'}, expected_metric_value=100)
    assert_metric(
        got_metrics, MetricName.PLATFORM_TOPOLOGY_CORES, expected_metric_value=1)
    assert_metric(
        got_metrics, MetricName.PLATFORM_VMSTAT_NUMA_PAGES_MIGRATED, expected_metric_value=5)

    # Labels, including the optional ones requested above.
    expected_labels = {
        "sockets": "1",
        "cores": "1",
        "cpus": "2",
        "host": "test_host",
        "wca_version": "0.1",
        "cpu_model": "intel xeon",
    }
    assert got_labels == expected_labels
    def _iterate(self):
        """Run a single iteration of the runner loop.

        Fetches tasks from the node, syncs containers, collects platform
        information and task data, invokes the optional body callback, waits,
        and finally sends the gathered metrics. When tasks or task
        measurements cannot be obtained, the error is logged and the
        iteration ends after waiting.
        """
        started_at = time.time()

        # Get information about tasks.
        try:
            tasks = self._node.get_tasks()
        except TaskSynchronizationException as e:
            log.error('Cannot synchronize tasks with node (error=%s) - skip this iteration!', e)
            self._wait()
            return

        append_additional_labels_to_tasks(self._task_label_generators, tasks)
        log.debug('Tasks detected: %d', len(tasks))

        # Keep sync of found tasks and internally managed containers.
        containers = self._containers_manager.sync_containers_state(tasks)
        mapping_lines = [
            '%s(%s)  =  %s' % (task.name, task.task_id, container._cgroup_path)
            for task, container in containers.items()
        ]
        log.log(TRACE, 'Tasks container mapping:\n%s', '\n'.join(mapping_lines))

        # @TODO why not in platform module?
        uncore_measurements = self._uncore_get_measurements()

        # Platform information
        platform_info = platforms.collect_platform_information(
            self._rdt_enabled,
            self._gather_hw_mm_topology,
            extra_platform_measurements=uncore_measurements,
            include_optional_labels=False,
        )
        platform, platform_metrics, platform_labels = platform_info

        # Common labels: platform labels extended with extra labels.
        common_labels = dict(platform_labels, **self._extra_labels)

        try:
            tasks_data = _prepare_tasks_data(containers)
        except MissingMeasurementException as e:
            log.error('Cannot synchronize tasks measurements (error=%s) - skip this iteration!', e)
            self._wait()
            return

        # Inject other runners code.
        if self._iterate_body_callback is not None:
            self._iterate_body_callback(containers, platform, tasks_data, common_labels)

        self._wait()

        # The registered duration is measured after the wait above.
        elapsed = time.time() - started_at
        profiling.profiler.register_duration('iteration', elapsed)

        # Generic metrics.
        package = MetricPackage(self._metrics_storage)
        package.add_metrics(_get_internal_metrics(tasks))
        package.add_metrics(platform_metrics)
        package.add_metrics(_build_tasks_metrics(tasks_data))
        package.add_metrics(profiling.profiler.get_metrics())
        package.add_metrics(get_logging_metrics())
        package.send(common_labels)
Ejemplo n.º 3
0
    def _initialize_rdt(self) -> bool:
        """Validate RDT requirements against the platform and clean up resctrl.

        Returns:
            False when a required RDT control feature is not enabled on the
            platform or when InvalidAllocations is raised during validation
            or cleanup; True otherwise.
        """
        platform, _, _ = platforms.collect_platform_information()
        rdt_info = platform.rdt_information

        # Cache control check - wanted but unavailable feature halts.
        if self._rdt_cache_control_required and not rdt_info.rdt_cache_control_enabled:
            log.error('RDT cache control enabled but is not supported by platform!')
            return False

        # MB control check - wanted but unavailable feature halts.
        if self._rdt_mb_control_required and not rdt_info.rdt_mb_control_enabled:
            log.error('RDT memory bandwidth enabled but '
                      'allocation is not supported by platform!')
            return False

        # Prepare initial values for L3, MB...
        root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
            rdt_info.cbm_mask,
            platform.sockets,
            rdt_info.rdt_mb_control_enabled,
            rdt_info.rdt_cache_control_enabled,
        )

        # ...override max values with values from allocation configuration
        if self._allocation_configuration.default_rdt_l3 is not None and \
                rdt_info.rdt_cache_control_enabled:
            root_rdt_l3 = self._allocation_configuration.default_rdt_l3
        if self._allocation_configuration.default_rdt_mb is not None and \
                rdt_info.rdt_mb_control_enabled:
            root_rdt_mb = self._allocation_configuration.default_rdt_mb

        try:
            if root_rdt_l3 is not None:
                validate_l3_string(
                    root_rdt_l3,
                    platform.sockets,
                    rdt_info.cbm_mask,
                    rdt_info.min_cbm_bits,
                )

            # The MB value is normalized only when present; None is passed through as is.
            cleanup_mb = root_rdt_mb
            if cleanup_mb is not None:
                cleanup_mb = normalize_mb_string(
                    root_rdt_mb,
                    platform.sockets,
                    rdt_info.mb_min_bandwidth,
                    rdt_info.mb_bandwidth_gran,
                )
            resctrl.cleanup_resctrl(root_rdt_l3, cleanup_mb, self._remove_all_resctrl_groups)
        except InvalidAllocations as e:
            log.error('Cannot initialize RDT subsystem: %s', e)
            return False

        return True
Ejemplo n.º 4
0
    def _initialize(self) -> Optional[int]:
        """Check privileges, RDT availability and prepare internal state.
        Can return error code that should stop Runner.

        Returns:
            1 when privileges are insufficient, RDT was explicitly enabled but
            is not available, RDT initialization failed, or RDT monitoring is
            not enabled; None when initialization succeeded.
        """
        if not security.are_privileges_sufficient():
            # Adjacent literals are concatenated verbatim, so every fragment
            # (except the last) must end with a space.
            log.error(
                "Insufficient privileges! "
                "Impossible to use perf_event_open/resctrl subsystems. "
                "For unprivileged user it is needed to: "
                "adjust /proc/sys/kernel/perf_event_paranoid (set to -1), "
                "has CAP_DAC_OVERRIDE and CAP_SETUID capabilities and "
                "SECBIT_NO_SETUID_FIXUP secure bit set.")
            return 1

        # Initialization (auto discovery Intel RDT features).
        rdt_available = resctrl.check_resctrl()
        if self._rdt_enabled is None:
            self._rdt_enabled = rdt_available
            log.info('RDT enabled (auto configuration): %s', self._rdt_enabled)
        elif self._rdt_enabled is True and not rdt_available:
            log.error('RDT explicitly enabled but not available - exiting!')
            return 1

        if self._rdt_enabled:
            # Resctrl is enabled and available, call a placeholder to allow further initialization.
            rdt_initialization_ok = self._initialize_rdt()
            if not rdt_initialization_ok:
                return 1

        # Postpone the container manager initialization after rdt checks were performed.
        # NOTE(review): the result of this call is not used in this method; the call is
        # kept because its side effects (if any) are not visible from here.
        platforms.collect_topology_information()

        platform, _, _ = platforms.collect_platform_information(
            self._rdt_enabled)
        rdt_information = platform.rdt_information

        self._event_names = _filter_out_event_names_for_cpu(
            self._event_names, platform.cpu_codename)

        # We currently do not support RDT without monitoring.
        if self._rdt_enabled and not rdt_information.is_monitoring_enabled():
            log.error('RDT monitoring is required - please enable CAT '
                      'or MBM with kernel parameters!')
            return 1

        self._containers_manager = ContainerManager(
            platform=platform,
            allocation_configuration=self._allocation_configuration,
            event_names=self._event_names,
            enable_derived_metrics=self._enable_derived_metrics,
        )
        return None
Ejemplo n.º 5
0
    def _iterate(self):
        """Run a single iteration of the runner loop.

        Fetches tasks, syncs containers, collects platform information and
        task data, calls ``_iterate_body``, waits, and sends the gathered
        metrics. If tasks cannot be synchronized with the node, the error is
        logged and the iteration ends after waiting.
        """
        started_at = time.time()

        # Get information about tasks.
        try:
            tasks = self._node.get_tasks()
        except TaskSynchronizationException as e:
            log.error('Cannot synchronize tasks with node (error=%s) - skip this iteration!', e)
            self._wait()
            return

        append_additional_labels_to_tasks(self._task_label_generators, tasks)
        log.debug('Tasks detected: %d', len(tasks))

        # Keep sync of found tasks and internally managed containers.
        containers = self._containers_manager.sync_containers_state(tasks)

        # Platform information
        platform_info = platforms.collect_platform_information(self._rdt_enabled)
        platform, platform_metrics, platform_labels = platform_info

        # Common labels: platform labels extended with extra labels.
        common_labels = dict(platform_labels, **self._extra_labels)

        # Tasks data
        tasks_data = _prepare_tasks_data(containers)
        tasks_measurements, tasks_resources, tasks_labels = tasks_data
        tasks_metrics = _build_tasks_metrics(tasks_labels, tasks_measurements)

        self._iterate_body(
            containers,
            platform,
            tasks_measurements,
            tasks_resources,
            tasks_labels,
            common_labels,
        )

        self._wait()

        # The registered duration is measured after the wait above.
        elapsed = time.time() - started_at
        profiling.profiler.register_duration('iteration', elapsed)

        # Generic metrics.
        package = MetricPackage(self._metrics_storage)
        package.add_metrics(_get_internal_metrics(tasks))
        package.add_metrics(platform_metrics)
        package.add_metrics(tasks_metrics)
        package.add_metrics(profiling.profiler.get_metrics())
        package.add_metrics(get_logging_metrics())
        package.send(common_labels)
    def _iterate(self):
        """Run a single iteration of the runner loop.

        Fetches tasks, sanitizes their label keys, syncs containers, collects
        platform information and task data, calls ``_iterate_body``, waits,
        and sends the gathered metrics.
        """
        started_at = time.time()

        # Get information about tasks.
        tasks = self._node.get_tasks()
        log.debug('Tasks detected: %d', len(tasks))

        # Replace every task's labels with a copy whose keys are sanitized.
        for task in tasks:
            task.labels = {
                sanitize_label(key): value
                for key, value in task.labels.items()
            }

        # Keep sync of found tasks and internally managed containers.
        containers = self._containers_manager.sync_containers_state(tasks)

        # Platform information
        platform_info = platforms.collect_platform_information(self._rdt_enabled)
        platform, platform_metrics, platform_labels = platform_info

        # Common labels: platform labels extended with extra labels.
        common_labels = dict(platform_labels, **self._extra_labels)

        # Tasks data
        tasks_data = _prepare_tasks_data(containers)
        tasks_measurements, tasks_resources, tasks_labels = tasks_data
        tasks_metrics = _build_tasks_metrics(tasks_labels, tasks_measurements)

        self._iterate_body(
            containers,
            platform,
            tasks_measurements,
            tasks_resources,
            tasks_labels,
            common_labels,
        )

        self._wait()

        # The registered duration is measured after the wait above.
        elapsed = time.time() - started_at
        profiling.profiler.register_duration('iteration', elapsed)

        # Generic metrics.
        package = MetricPackage(self._metrics_storage)
        package.add_metrics(_get_internal_metrics(tasks))
        package.add_metrics(platform_metrics)
        package.add_metrics(tasks_metrics)
        package.add_metrics(profiling.profiler.get_metrics())
        package.add_metrics(get_logging_metrics())
        package.send(common_labels)
    def _initialize(self) -> Optional[int]:
        """Check RDT availability, privileges and prepare internal state.

        Returns:
            1 (an error code that should stop the Runner) when RDT was
            explicitly enabled but is unavailable, privileges are
            insufficient, the RDT initialization callback reports failure, or
            RDT monitoring is not enabled; None on success.
        """
        # Initialization (auto discovery Intel RDT features).
        resctrl_present = resctrl.check_resctrl()
        if self._rdt_enabled is None:
            self._rdt_enabled = resctrl_present
            log.info('RDT enabled (auto configuration): %s', self._rdt_enabled)
        elif self._rdt_enabled is True and not resctrl_present:
            log.error('RDT explicitly enabled but not available - exiting!')
            return 1

        # _allocation_configuration is set in allocation mode (AllocationRunner)
        # so we need access to write in cgroups.
        needs_cgroup_write = self._allocation_configuration is not None
        needs_resctrl = self._rdt_enabled
        needs_perf = len(self._event_names) > 0

        privileges_ok = security.are_privileges_sufficient(
            needs_cgroup_write, needs_resctrl, needs_perf)
        if not privileges_ok:
            return 1

        # Resctrl is enabled and available: run the optional RDT initialization
        # callback. For MeasurementRunner alone there is nothing to configure in
        # RDT; the callback exists for other runners reusing this functionality.
        if self._rdt_enabled and self._initialize_rdt_callback is not None:
            if not self._initialize_rdt_callback():
                return 1

        log.debug('rdt_enabled: %s', self._rdt_enabled)
        log.debug('gather_hw_mm_topology: %s', self._gather_hw_mm_topology)
        platform_info = platforms.collect_platform_information(
            self._rdt_enabled,
            gather_hw_mm_topology=self._gather_hw_mm_topology,
        )
        platform = platform_info[0]
        rdt_info = platform.rdt_information

        self._event_names = _filter_out_event_names_for_cpu(
            self._event_names, platform.cpu_codename)

        # We currently do not support RDT without monitoring.
        if self._rdt_enabled and not rdt_info.is_monitoring_enabled():
            log.error('RDT monitoring is required - please enable CAT '
                      'or MBM with kernel parameters!')
            return 1

        self._containers_manager = ContainerManager(
            platform=platform,
            allocation_configuration=self._allocation_configuration,
            event_names=self._event_names,
            enable_derived_metrics=self._enable_derived_metrics,
            wss_reset_interval=self._wss_reset_interval,
            perf_aggregate_cpus=self._perf_aggregate_cpus,
        )

        self._init_uncore_pmu(self._enable_derived_metrics, self._enable_perf_uncore, platform)

        return None
Ejemplo n.º 8
0
    def _initialize(self) -> Optional[int]:
        """Check RDT availability, privileges and prepare internal state.
        Can return error code that should stop Runner.

        Flow:
        - Conclude requirements based on configuration
        - Conclude required features based on auto discovery
        - confront user expectations from configuration file with resctrl fs and security access
        - check RDT HW monitoring features availability

        Returns:
            1 when any of the checks fails, None when initialization succeeded.
        """
        resctrl_available = resctrl.check_resctrl()
        # If enabled explicitly check resctrl availability right now.
        if self._rdt_enabled is True and not resctrl_available:
            log.error('RDT explicitly enabled but resctrl fs not available - exiting!')
            return 1

        # Auto discovery Intel RDT features.
        if self._rdt_enabled is None:
            # Assume yes temporary - but will check monitoring/access later.
            log.debug('Enable RDT auto discovery (resctrl availability=%s)', resctrl_available)
            self._rdt_enabled = resctrl_available
            rdt_auto_enabling = True
        else:
            rdt_auto_enabling = False

        log.debug('gather_hw_mm_topology: %s', self._gather_hw_mm_topology)
        platform, _, _ = platforms.collect_platform_information(
            resctrl_available,
            gather_hw_mm_topology=self._gather_hw_mm_topology
        )

        # Confront RDT (resctrl fs) with HW enabled monitoring features.
        if self._rdt_enabled and not platform.rdt_information.is_monitoring_enabled():
            # Note: WCA does not support RDT without monitoring (keeps a mapping of
            # cgroups and resctrl groups).
            # Each fragment ends with a space so the sentences do not run together.
            msg = ('Resctrl is available but RDT monitoring features are not! '
                   'Please enable CMT or MBM with kernel parameters (monitoring is '
                   'required for CAT or MBA allocation)!')
            if rdt_auto_enabling:
                log.debug(msg)
                self._rdt_enabled = False
                # Override: RDT information should not be available later,
                # e.g. in ContainerManager.
                platform.rdt_information = None
            else:
                # RDT was explicitly requested - fail fast here.
                log.error(msg)
                return 1

        # All RDT checks (security/check) done - show info and call initialization callback.
        log.info('RDT: %s %s', 'enabled' if self._rdt_enabled else 'disabled',
                 ' (auto discovery)' if rdt_auto_enabling else '',
                 )

        # Event names (perf cgroups)
        self._event_names = filter_out_event_names_for_cpu(
            self._event_names, platform.cpu_codename)

        log.info('Enabling %i perf events (for cgroups).', len(self._event_names))
        log.debug('Enabling perf events: %s', ', '.join(self._event_names))
        # Check and assume most popular number of available number of HW counters.
        if self._event_names and not check_perf_event_count_limit(
                self._event_names, platform.cpus, platform.cores):
            return 1

        # _allocation_configuration is set in allocation mode (AllocationRunner)
        # so we need access to write in cgroups.
        write_to_cgroup = self._allocation_configuration is not None
        use_perf = len(self._event_names) > 0
        # Check we have enough access.
        if not security.are_privileges_sufficient(write_to_cgroup, self._rdt_enabled, use_perf):
            return 1

        # Resctrl is enabled and available, call a placeholder to allow further initialization.
        # For "measurement mode" it's nothing to configure in RDT.
        # Check if it's needed to specific rdt initialization in case
        # of using "MeasurementRunner" as component functionality in other runners e.g. Allocation.
        if self._rdt_enabled and self._initialize_rdt_callback is not None:
            rdt_initialization_ok = self._initialize_rdt_callback()
            if not rdt_initialization_ok:
                return 1

        self._containers_manager = ContainerManager(
            platform=platform,
            allocation_configuration=self._allocation_configuration,
            event_names=self._event_names,
            enable_derived_metrics=self._enable_derived_metrics,
            wss_reset_cycles=self._wss_reset_cycles,
            wss_stable_cycles=self._wss_stable_cycles,
            wss_membw_threshold=self._wss_membw_threshold,
            perf_aggregate_cpus=self._perf_aggregate_cpus,
            interval=self._interval,
            sched=self._sched,
        )
        log.log(TRACE, 'container manager config: %s', self._containers_manager.__dict__)

        self._init_uncore_pmu_events(self._enable_derived_metrics, self._uncore_events, platform)

        return None