コード例 #1
0
    def __init__(
            self,
            measurement_runner: MeasurementRunner,
            allocator: Allocator,
            allocations_storage: Storage = DEFAULT_STORAGE,
            anomalies_storage: Storage = DEFAULT_STORAGE,
            rdt_mb_control_required: bool = False,
            rdt_cache_control_required: bool = False,
            remove_all_resctrl_groups: bool = False,
    ):
        """Attach allocation logic to an already constructed measurement runner.

        When the wrapped runner has a falsy ``_allocation_configuration`` it is
        replaced with a fresh ``AllocationConfiguration()``. This object's
        ``_iterate_body`` and ``_initialize_rdt`` methods are then registered
        on the wrapped runner as callbacks.
        """
        runner = measurement_runner

        # The wrapped runner must carry an allocation configuration.
        if not runner._allocation_configuration:
            runner._allocation_configuration = AllocationConfiguration()
        self._measurement_runner = runner

        # Storages for the two kinds of output produced by this runner.
        self._allocations_storage = allocations_storage
        self._anomalies_storage = anomalies_storage

        # Allocation logic and RDT requirements.
        self._allocator = allocator
        self._rdt_mb_control_required = rdt_mb_control_required
        self._rdt_cache_control_required = rdt_cache_control_required
        self._remove_all_resctrl_groups = remove_all_resctrl_groups

        # Anomaly bookkeeping.
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics.
        self._allocations_counter, self._allocations_errors = 0, 0

        # Register the callbacks on the wrapped runner.
        runner._set_iterate_body_callback(self._iterate_body)
        runner._set_initialize_rdt_callback(self._initialize_rdt)
コード例 #2
0
    def __init__(
            self,
            node: nodes.Node,
            allocator: Allocator,
            metrics_storage: storage.Storage = DEFAULT_STORAGE,
            anomalies_storage: storage.Storage = DEFAULT_STORAGE,
            allocations_storage: storage.Storage = DEFAULT_STORAGE,
            action_delay: Numeric(0, 60) = 1.,  # [s]
            rdt_enabled: Optional[bool] = None,  # None means auto configuration.
            rdt_mb_control_required: bool = False,
            rdt_cache_control_required: bool = False,
            extra_labels: Dict[Str, Str] = None,
            allocation_configuration: Optional[AllocationConfiguration] = None,
            remove_all_resctrl_groups: bool = False,
            event_names: Optional[List[str]] = None,
            enable_derived_metrics: bool = False,
            task_label_generators: Dict[str, TaskLabelGenerator] = None,
    ):
        """Set up the superclass part first, then the allocation specific state.

        A falsy ``allocation_configuration`` is replaced with a default
        ``AllocationConfiguration()`` before it is handed to the superclass
        constructor.
        """
        # Must be set before super().__init__() because it is passed on to it.
        if allocation_configuration:
            self._allocation_configuration = allocation_configuration
        else:
            self._allocation_configuration = AllocationConfiguration()

        super().__init__(
            node, metrics_storage, action_delay, rdt_enabled, extra_labels,
            _allocation_configuration=self._allocation_configuration,
            event_names=event_names,
            enable_derived_metrics=enable_derived_metrics,
            task_label_generators=task_label_generators,
        )

        # RDT requirements; assigned after super().__init__() on purpose.
        self._rdt_mb_control_required = rdt_mb_control_required  # Override False from superclass.
        self._rdt_cache_control_required = rdt_cache_control_required
        self._remove_all_resctrl_groups = remove_all_resctrl_groups

        # Allocation logic and the storages it reports to.
        self._allocator = allocator
        self._allocations_storage = allocations_storage
        self._anomalies_storage = anomalies_storage

        # Anomaly bookkeeping.
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics.
        self._allocations_counter, self._allocations_errors = 0, 0
コード例 #3
0
class AllocationRunner(MeasurementRunner):
    """Runner is responsible for getting information about tasks from node,
    calling allocate() callback on allocator, performing returned allocations
    and storing all allocation related metrics in allocations_storage.

    Because Allocator interface is also detector, we store serialized detected anomalies
    in anomalies_storage and all other measurements in metrics_storage.

    Arguments:
        node: component used for tasks discovery
        allocator: component that provides allocation logic
        metrics_storage: storage to store platform, internal, resource and task metrics
            (defaults to DEFAULT_STORAGE/LogStorage to output for standard error)
        anomalies_storage: storage to store serialized anomalies and extra metrics
            (defaults to DEFAULT_STORAGE/LogStorage to output for standard error)
        allocations_storage: storage to store serialized resource allocations
            (defaults to DEFAULT_STORAGE/LogStorage to output for standard error)
        action_delay: iteration duration in seconds (None disables wait and iterations)
            (defaults to 1 second)
        rdt_enabled: enables or disables support for RDT monitoring and allocation
            (defaults to None(auto) based on platform capabilities)
        rdt_mb_control_required: indicates that MB control is required,
            if the platform does not support this feature the WCA will exit
        rdt_cache_control_required: indicates that L3 control is required,
            if the platform does not support this feature the WCA will exit
        extra_labels: additional labels attached to every metric
            (defaults to empty dict)
        allocation_configuration: allows fine grained control over allocations
            (defaults to AllocationConfiguration() instance)
        remove_all_resctrl_groups (bool): remove all RDT controls groups upon starting
            (defaults to False)
        event_names: perf counters to monitor
            (defaults to instructions, cycles, cache-misses, memstalls)
        enable_derived_metrics: enable derived metrics ips, ipc and cache_hit_ratio
            (based on enabled_event names), default to False
        task_label_generators: component to generate additional labels for tasks
    """

    def __init__(
            self,
            node: nodes.Node,
            allocator: Allocator,
            metrics_storage: storage.Storage = DEFAULT_STORAGE,
            anomalies_storage: storage.Storage = DEFAULT_STORAGE,
            allocations_storage: storage.Storage = DEFAULT_STORAGE,
            action_delay: Numeric(0, 60) = 1.,  # [s]
            rdt_enabled: Optional[bool] = None,  # Defaults(None) - auto configuration.
            rdt_mb_control_required: bool = False,
            rdt_cache_control_required: bool = False,
            extra_labels: Dict[Str, Str] = None,
            allocation_configuration: Optional[AllocationConfiguration] = None,
            remove_all_resctrl_groups: bool = False,
            event_names: Optional[List[str]] = DEFAULT_EVENTS,
            enable_derived_metrics: bool = False,
            task_label_generators: Dict[str, TaskLabelGenerator] = None,
    ):

        # Set before super().__init__() because it is passed on to the superclass.
        self._allocation_configuration = allocation_configuration or AllocationConfiguration()

        super().__init__(node, metrics_storage, action_delay, rdt_enabled,
                         extra_labels, _allocation_configuration=self._allocation_configuration,
                         event_names=event_names, enable_derived_metrics=enable_derived_metrics,
                         task_label_generators=task_label_generators)

        # Allocation specific.
        self._allocator = allocator
        self._allocations_storage = allocations_storage
        self._rdt_mb_control_required = rdt_mb_control_required  # Override False from superclass.
        self._rdt_cache_control_required = rdt_cache_control_required

        # Anomaly.
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics
        self._allocations_counter = 0
        self._allocations_errors = 0

        self._remove_all_resctrl_groups = remove_all_resctrl_groups

    def _initialize_rdt(self) -> bool:
        """Check required RDT features and reset resctrl to its initial values.

        Returns:
            False when a required control feature (cache or MB) is not enabled
            on the platform, or when InvalidAllocations is raised while
            validating/applying the initial L3 and MB values; True otherwise.
        """
        platform, _, _ = platforms.collect_platform_information()

        # Cache control check.
        if self._rdt_cache_control_required and \
                not platform.rdt_information.rdt_cache_control_enabled:
            # Wanted unavailable feature - halt
            log.error('RDT cache control enabled but is not supported by platform!')
            return False

        # MB control check.
        if self._rdt_mb_control_required and \
                not platform.rdt_information.rdt_mb_control_enabled:
            # Some wanted unavailable feature - halt.
            log.error('RDT memory bandwidth enabled but '
                      'allocation is not supported by platform!')
            return False

        # Prepare initial values for L3, MB...
        root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
            platform.rdt_information.cbm_mask,
            platform.sockets,
            platform.rdt_information.rdt_mb_control_enabled,
            platform.rdt_information.rdt_cache_control_enabled
        )

        # ...override max values with values from allocation configuration
        if self._allocation_configuration.default_rdt_l3 is not None and \
                platform.rdt_information.rdt_cache_control_enabled:
            root_rdt_l3 = self._allocation_configuration.default_rdt_l3
        if self._allocation_configuration.default_rdt_mb is not None and \
                platform.rdt_information.rdt_mb_control_enabled:
            root_rdt_mb = self._allocation_configuration.default_rdt_mb

        try:
            if root_rdt_l3 is not None:
                validate_l3_string(root_rdt_l3, platform.sockets,
                                   platform.rdt_information.cbm_mask,
                                   platform.rdt_information.min_cbm_bits)

            # MB value, when present, is normalized before it is applied.
            if root_rdt_mb is not None:
                root_rdt_mb = normalize_mb_string(
                        root_rdt_mb,
                        platform.sockets,
                        platform.rdt_information.mb_min_bandwidth,
                        platform.rdt_information.mb_bandwidth_gran)

            resctrl.cleanup_resctrl(
                    root_rdt_l3, root_rdt_mb, self._remove_all_resctrl_groups)
        except InvalidAllocations as e:
            log.error('Cannot initialize RDT subsystem: %s', e)
            return False

        return True

    def _iterate_body(self,
                      containers, platform,
                      tasks_measurements, tasks_resources,
                      tasks_labels, common_labels):
        """Allocator callback body.

        Calls the allocator, validates what it returned, performs the resulting
        changeset (unless validation failed) and sends anomaly and allocation
        metrics to their storages.
        """

        current_allocations = _get_tasks_allocations(containers)

        # Allocator callback
        allocate_start = time.time()
        new_allocations, anomalies, extra_metrics = self._allocator.allocate(
            platform, tasks_measurements, tasks_resources, tasks_labels,
            current_allocations)
        allocate_duration = time.time() - allocate_start

        # Validate callback output
        _validate_allocate_return_vals(new_allocations, anomalies, extra_metrics)

        log.debug('Anomalies detected: %d', len(anomalies))
        log.debug('Current allocations: %s', current_allocations)

        # Create context aware allocations objects for current allocations.
        current_allocations_values = TasksAllocationsValues.create(
            self._rdt_enabled, current_allocations, self._containers_manager.containers, platform)

        # Handle allocations: calculate changeset and target allocations.
        allocations_changeset_values = None
        target_allocations_values = current_allocations_values
        try:
            # Special validation step needed for Kubernetes.
            validate_shares_allocation_for_kubernetes(tasks=containers.keys(),
                                                      allocations=new_allocations)

            # Create and validate context aware allocations objects for new allocations.
            log.debug('New allocations: %s', new_allocations)
            new_allocations_values = TasksAllocationsValues.create(
                self._rdt_enabled, new_allocations, self._containers_manager.containers, platform)
            new_allocations_values.validate()

            # Calculate changeset and target_allocations.
            target_allocations_values, allocations_changeset_values = \
                new_allocations_values.calculate_changeset(current_allocations_values)
            target_allocations_values.validate()

            self._allocations_counter += len(new_allocations)

        except InvalidAllocations as e:
            # Handle any allocation validation error.
            # Log errors and restore current to generate proper metrics.
            log.error('Invalid allocations: %s', str(e))
            log.warning('Ignoring all allocations in this iteration due to validation error!')
            self._allocations_errors += 1
            target_allocations_values = current_allocations_values
            # The changeset may already be assigned when validation of the target
            # allocations fails; drop it so that nothing is performed below.
            allocations_changeset_values = None

        # Handle allocations: perform allocations based on changeset.
        if allocations_changeset_values:
            log.debug('Allocations changeset: %s', allocations_changeset_values)
            log.info('Performing allocations on %d tasks.', len(
                allocations_changeset_values))
            allocations_changeset_values.perform_allocations()

        # Prepare anomaly metrics.
        anomaly_metrics = convert_anomalies_to_metrics(anomalies, tasks_labels)
        update_anomalies_metrics_with_task_information(anomaly_metrics, tasks_labels)

        # Store anomalies information
        anomalies_package = MetricPackage(self._anomalies_storage)
        anomalies_package.add_metrics(
            anomaly_metrics,
            extra_metrics,
            self._anomalies_statistics.get_metrics(anomalies)
        )
        anomalies_package.send(common_labels)

        # Prepare allocations metrics.
        allocations_metrics = target_allocations_values.generate_metrics()
        allocations_statistic_metrics = _get_allocations_statistics_metrics(
            self._allocations_counter, self._allocations_errors, allocate_duration)

        # Store allocations metrics.
        allocations_package = MetricPackage(self._allocations_storage)
        allocations_package.add_metrics(
            allocations_metrics,
            extra_metrics,
            allocations_statistic_metrics,
        )
        allocations_package.send(common_labels)
コード例 #4
0
class AllocationRunner(Runner):
    """rst
    Runner is responsible for getting information about tasks from node,
    calling allocate() callback on allocator, performing returned allocations
    and storing all allocation related metrics in allocations_storage.

    Because Allocator interface is also detector, we store serialized detected anomalies
    in anomalies_storage and all other measurements in metrics_storage.


    - ``measurement_runner``: **MeasurementRunner**

        Measurement runner object.

    - ``allocator``: **Allocator**

        Component that provides allocation logic.

    - ``anomalies_storage``: **Storage** = `DEFAULT_STORAGE`

        Storage to store serialized anomalies and extra metrics.

    - ``allocations_storage``: **Storage** = `DEFAULT_STORAGE`

        Storage to store serialized resource allocations.

    - ``rdt_mb_control_required``: **bool** = *False*

        Indicates that MB control is required,
        if the platform does not support this feature the WCA will exit.

    - ``rdt_cache_control_required``: **bool** = *False*

        Indicates that L3 control is required,
        if the platform does not support this feature the WCA will exit.

    - ``remove_all_resctrl_groups``: **bool** = *False*

        Remove all RDT controls groups upon starting.
    """
    def __init__(self,
                 measurement_runner: MeasurementRunner,
                 allocator: Allocator,
                 allocations_storage: Storage = DEFAULT_STORAGE,
                 anomalies_storage: Storage = DEFAULT_STORAGE,
                 rdt_mb_control_required: bool = False,
                 rdt_cache_control_required: bool = False,
                 remove_all_resctrl_groups: bool = False):

        # The wrapped runner must carry an allocation configuration.
        if not measurement_runner._allocation_configuration:
            measurement_runner._allocation_configuration = AllocationConfiguration(
            )

        self._measurement_runner = measurement_runner

        # Allocation specific.
        self._allocator = allocator
        self._allocations_storage = allocations_storage

        self._rdt_mb_control_required = rdt_mb_control_required
        self._rdt_cache_control_required = rdt_cache_control_required

        # Anomaly.
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()

        # Internal allocation statistics
        self._allocations_counter = 0
        self._allocations_errors = 0

        self._remove_all_resctrl_groups = remove_all_resctrl_groups

        # Hook allocation logic into the wrapped measurement runner.
        self._measurement_runner._set_iterate_body_callback(self._iterate_body)
        self._measurement_runner._set_initialize_rdt_callback(
            self._initialize_rdt)

    def run(self) -> int:
        """Run the wrapped measurement runner and return its result."""
        # Propagate the result instead of silently returning None.
        return self._measurement_runner.run()

    def _initialize_rdt(self) -> bool:
        """Check required RDT features and reset resctrl to its initial values.

        Returns:
            False when a required control feature (cache or MB) is not enabled
            on the platform, or when InvalidAllocations is raised while
            validating/applying the initial L3 and MB values; True otherwise.
        """
        platform, _, _ = platforms.collect_platform_information()

        # Cache control check.
        if self._rdt_cache_control_required and \
                not platform.rdt_information.rdt_cache_control_enabled:
            # Wanted unavailable feature - halt
            log.error(
                'RDT cache control enabled but is not supported by platform!')
            return False

        # MB control check.
        if self._rdt_mb_control_required and \
                not platform.rdt_information.rdt_mb_control_enabled:
            # Some wanted unavailable feature - halt.
            log.error('RDT memory bandwidth enabled but '
                      'allocation is not supported by platform!')
            return False

        # Prepare initial values for L3, MB...
        root_rdt_l3, root_rdt_mb = resctrl.get_max_rdt_values(
            platform.rdt_information.cbm_mask, platform.sockets,
            platform.rdt_information.rdt_mb_control_enabled,
            platform.rdt_information.rdt_cache_control_enabled)

        # ...override max values with values from allocation configuration
        allocation_configuration = self._measurement_runner._allocation_configuration
        if allocation_configuration.default_rdt_l3 is not None and \
                platform.rdt_information.rdt_cache_control_enabled:
            root_rdt_l3 = allocation_configuration.default_rdt_l3
        if allocation_configuration.default_rdt_mb is not None and \
                platform.rdt_information.rdt_mb_control_enabled:
            root_rdt_mb = allocation_configuration.default_rdt_mb

        try:
            if root_rdt_l3 is not None:
                validate_l3_string(root_rdt_l3, platform.sockets,
                                   platform.rdt_information.cbm_mask,
                                   platform.rdt_information.min_cbm_bits)

            # MB value, when present, is normalized before it is applied.
            if root_rdt_mb is not None:
                root_rdt_mb = normalize_mb_string(
                    root_rdt_mb, platform.sockets,
                    platform.rdt_information.mb_min_bandwidth,
                    platform.rdt_information.mb_bandwidth_gran)

            resctrl.cleanup_resctrl(root_rdt_l3, root_rdt_mb,
                                    self._remove_all_resctrl_groups)
        except InvalidAllocations as e:
            log.error('Cannot initialize RDT subsystem: %s', e)
            return False

        return True

    def _iterate_body(self, containers: Dict[Task, ContainerInterface],
                      platform: platforms.Platform, tasks_data: TasksData,
                      common_labels):
        """Allocator callback body.

        Calls the allocator, validates what it returned, performs the resulting
        changeset (unless validation failed) and sends anomaly and allocation
        metrics to their storages.
        """

        current_allocations = _get_tasks_allocations(containers)

        _update_tasks_data_with_allocations(tasks_data, current_allocations)

        # Allocator callback
        allocate_start = time.time()
        new_allocations, anomalies, extra_metrics = self._allocator.allocate(
            platform, tasks_data)
        allocate_duration = time.time() - allocate_start

        # Validate callback output
        _validate_allocate_return_vals(new_allocations, anomalies,
                                       extra_metrics)

        log.debug('Anomalies detected: %d', len(anomalies))
        log.debug('Current allocations: %s', current_allocations)

        # Create context aware allocations objects for current allocations.
        current_allocations_values = TasksAllocationsValues.create(
            self._measurement_runner._rdt_enabled, current_allocations,
            self._measurement_runner._containers_manager.containers, platform)

        # Handle allocations: calculate changeset and target allocations.
        allocations_changeset_values = None
        target_allocations_values = current_allocations_values
        try:
            # Special validation step needed for Kubernetes.
            validate_shares_allocation_for_kubernetes(
                tasks=containers.keys(), allocations=new_allocations)

            # Create and validate context aware allocations objects for new allocations.
            log.debug('New allocations: %s', new_allocations)
            new_allocations_values = TasksAllocationsValues.create(
                self._measurement_runner._rdt_enabled, new_allocations,
                self._measurement_runner._containers_manager.containers,
                platform)
            new_allocations_values.validate()

            # Calculate changeset and target_allocations.
            target_allocations_values, allocations_changeset_values = \
                new_allocations_values.calculate_changeset(current_allocations_values)
            target_allocations_values.validate()

            self._allocations_counter += len(new_allocations)

        except InvalidAllocations as e:
            # Handle any allocation validation error.
            # Log errors and restore current to generate proper metrics.
            log.error('Invalid allocations: %s', str(e))
            log.warning(
                'Ignoring all allocations in this iteration due to validation error!'
            )
            self._allocations_errors += 1
            target_allocations_values = current_allocations_values
            # The changeset may already be assigned when validation of the target
            # allocations fails; drop it so that nothing is performed below.
            allocations_changeset_values = None

        # Handle allocations: perform allocations based on changeset.
        if allocations_changeset_values:
            log.debug('Allocations changeset: %s',
                      allocations_changeset_values)
            log.info('Performing allocations on %d tasks.',
                     len(allocations_changeset_values))
            allocations_changeset_values.perform_allocations()

        # Prepare anomaly metrics.
        anomaly_metrics = convert_anomalies_to_metrics(anomalies, tasks_data)
        update_anomalies_metrics_with_task_information(anomaly_metrics,
                                                       tasks_data)

        # Store anomalies information
        anomalies_package = MetricPackage(self._anomalies_storage)
        anomalies_package.add_metrics(
            anomaly_metrics, extra_metrics,
            self._anomalies_statistics.get_metrics(anomalies))
        anomalies_package.send(common_labels)

        # Prepare allocations metrics.
        allocations_metrics = target_allocations_values.generate_metrics()
        allocations_statistic_metrics = _get_allocations_statistics_metrics(
            self._allocations_counter, self._allocations_errors,
            allocate_duration)

        # Store allocations metrics.
        allocations_package = MetricPackage(self._allocations_storage)
        allocations_package.add_metrics(
            allocations_metrics,
            extra_metrics,
            allocations_statistic_metrics,
        )
        allocations_package.send(common_labels)