def _process_measurements(self, tasks_measurements: TasksMeasurements,
                              tasks_labels: TasksLabels,
                              metric_list: List[wcaMetric], timestamp: float,
                              assigned_cpus: float):
        """Feed one round of task measurements into the per-task containers.

        For every task the matching container is updated and its ``util`` is
        added to the system-wide total; tasks that ``_is_be_app`` does not
        flag also count towards the second total (``lcutil``; presumably
        "latency critical" vs. "best effort" - confirm against the helpers).

        When ``self.agg`` is set, the container metrics are converted with
        ``get_wca_metrics`` and appended to ``metric_list`` (mutated in
        place). In COLLECT_MODE they are additionally recorded for tasks
        that map to an app.

        After the loop, COLLECT_MODE records ``lcutil`` and DETECT_MODE
        appends the headroom metrics to ``metric_list``.

        :param tasks_measurements: mapping of task id to its measurements
        :param tasks_labels: task labels, passed through to the app/BE helpers
        :param metric_list: output list, extended in place
        :param timestamp: timestamp forwarded to containers and recorders
        :param assigned_cpus: forwarded to ``_get_headroom_metrics``
        """
        sysutil = 0
        lcutil = 0
        for cid, measurements in tasks_measurements.items():
            # Resolved once per task; reused for metric conversion and
            # for recording below.
            app = self._cid_to_app(cid, tasks_labels)
            container = self._get_container_from_taskid(cid)
            container.update_measurement(timestamp, measurements, self.agg)
            if not self._is_be_app(cid, tasks_labels):
                lcutil += container.util
            sysutil += container.util

            if not self.agg:
                continue

            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if not metrics:
                continue

            metric_list.extend(container.get_wca_metrics(app))
            # Only tasks that resolve to an app are recorded.
            if self.mode_config == ContentionDetector.COLLECT_MODE and app:
                self._record_metrics(timestamp, cid, app, metrics)

        if self.mode_config == ContentionDetector.COLLECT_MODE:
            self._record_utils(timestamp, lcutil)
        elif self.mode_config == ContentionDetector.DETECT_MODE:
            metric_list.extend(
                self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
Beispiel #2
0
    def _process_measurements(self, tasks_measurements: TasksMeasurements,
                              tasks_labels: TasksLabels,
                              metric_list: List[WCAMetric], timestamp: float,
                              assigned_cpus: float, cpu_model: str):
        """Feed one round of task measurements into the per-task containers.

        For every task the matching container is updated and its ``util`` is
        added to the system-wide total; tasks that ``_is_be_app`` does not
        flag also count towards ``lcutil``.

        When ``self.agg`` is set and the container has metrics for a task
        that maps to an app, the metrics are converted with
        ``get_wca_metrics`` (using the app's ``cpus`` entry from
        ``self.workload_meta``), appended to ``metric_list`` and recorded
        to the metric file. Tasks without a resolved app are skipped, so
        ``workload_meta`` is never indexed with an empty app.

        Headroom metrics are always appended to ``metric_list`` at the end.

        :param tasks_measurements: mapping of task id to its measurements
        :param tasks_labels: task labels, passed through to the app/BE helpers
        :param metric_list: output list, extended in place
        :param timestamp: timestamp forwarded to containers and recorders
        :param assigned_cpus: forwarded to ``_get_headroom_metrics``
        :param cpu_model: passed through ``correct_key_characters`` before
            being recorded
        """
        sysutil = 0
        lcutil = 0
        for cid, measurements in tasks_measurements.items():
            app = self._cid_to_app(cid, tasks_labels)
            container = self._get_container_from_taskid(cid)
            container.update_measurement(timestamp, measurements, self.agg)
            if not self._is_be_app(cid, tasks_labels):
                lcutil += container.util
            sysutil += container.util

            if not self.agg:
                continue

            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            # The app must be known before workload_meta is consulted;
            # a task that maps to no app has no 'cpus' entry to read.
            if not (metrics and app):
                continue

            vcpus = self.workload_meta[app]['cpus']
            metric_list.extend(container.get_wca_metrics(app, vcpus))
            # always try to init header column considering log rotate
            self._init_data_file(self.metric_file, self.mcols)
            self._record_metrics(timestamp, cid, app,
                                 correct_key_characters(cpu_model),
                                 vcpus, metrics)

        metric_list.extend(
            self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
def _build_tasks_metrics(tasks_labels: TasksLabels,
                         tasks_measurements: TasksMeasurements) -> List[Metric]:
    """Return one flat list of metrics for all tasks, labelled per task.

    Each task's measurements are turned into metrics with ``create_metrics``;
    every resulting metric has its ``labels`` updated with that task's entry
    from ``tasks_labels`` before being added to the result.
    """
    collected: List[Metric] = []

    for task_id, measurements in tasks_measurements.items():
        per_task = create_metrics(measurements)
        for metric in per_task:
            # Attach the labels that belong to the owning task.
            metric.labels.update(tasks_labels[task_id])
        collected += per_task

    return collected
Beispiel #4
0
    def _process_measurements(
        self,
        tasks_measurements: TasksMeasurements,
        tasks_labels: TasksLabels,
        metric_list: List[WCAMetric],
        timestamp: float,
        assigned_cpus: float,
        cpu_model: str
    ):
        """Update containers from task measurements and drive CPU control.

        Containers are created on first sight of a task id and cached in
        ``self.container_map``. With ``self.enable_control`` set, a newly
        seen task gets its initial controller setup: members of ``self.bes``
        get share 0.0 and are budgeted on the CPU, L3 and (if enabled)
        memory-bandwidth controllers; all other tasks get share 1.0 and,
        with ``self.exclusive_cat``, an L3 budgeting call.

        Utilisation is summed per group (``self.bes`` members vs. the rest)
        and in total. With ``self.agg`` set, metrics of tasks that map to an
        app are appended to ``metric_list`` and recorded to the metric file.
        Finally the headroom metrics are appended and, when control is
        enabled and ``self.bes`` is non-empty, the CPU controller is updated
        with the result of ``detect_margin_exceed``.
        """
        total_util = 0
        lc_util = 0
        be_util = 0

        for cid, measurements in tasks_measurements.items():
            app = self._cid_to_app(cid, tasks_labels)

            if cid not in self.container_map:
                # First time this task is seen: create and register it.
                container = Container(cid)
                self.container_map[cid] = container
                if self.enable_control:
                    if cid not in self.bes:
                        self.cpuc.set_share(cid, 1.0)
                        if self.exclusive_cat:
                            self.l3c.budgeting([], [cid])
                    else:
                        self.cpuc.set_share(cid, 0.0)
                        self.cpuc.budgeting([cid], [])
                        self.l3c.budgeting([cid], [])
                        if self.mbc_enabled:
                            self.mbc.budgeting([cid], [])
            else:
                container = self.container_map[cid]

            container.update_measurement(timestamp, measurements, self.agg)

            if cid in self.bes:
                be_util += container.util
            else:
                lc_util += container.util
            total_util += container.util

            if not self.agg:
                continue

            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if not (metrics and app):
                continue

            vcpus = self.workload_meta[app]['cpus']
            metric_list.extend(container.get_wca_metrics(app, vcpus))
            # always try to init header column considering log rotate
            self._init_data_file(self.metric_file, self.mcols)
            self._record_metrics(
                timestamp, cid, app,
                correct_key_characters(cpu_model),
                vcpus, metrics,
            )

        headroom = self._get_headroom_metrics(assigned_cpus, lc_util, total_util)
        metric_list.extend(headroom)

        if self.enable_control and self.bes:
            exceeded, on_hold = self.cpuc.detect_margin_exceed(lc_util, be_util)
            cpu_controller = self.controllers[ContendedResource.CPUS]
            cpu_controller.update(self.bes, [], exceeded, on_hold)
    def _process_measurements(self, tasks_measurements: TasksMeasurements,
                              tasks_labels: TasksLabels,
                              metric_list: List[WCAMetric], timestamp: float,
                              assigned_cpus: float):
        """Update containers from task measurements; record or control by mode.

        Containers are created on first sight of a task id and cached in
        ``self.container_map``. In DETECT_MODE a newly seen task gets its
        initial controller setup: members of ``self.bes`` get share 0.0 and
        are budgeted on the CPU, L3 and (if enabled) memory-bandwidth
        controllers; all other tasks get share 1.0 and, with
        ``self.exclusive_cat``, an L3 budgeting call.

        Utilisation is summed per group (``self.bes`` members vs. the rest)
        and in total. With ``self.agg`` set, container metrics are converted
        and appended to ``metric_list`` (mutated in place); in COLLECT_MODE
        they are also recorded for tasks that map to an app.

        After the loop, COLLECT_MODE records the non-BE utilisation, while
        DETECT_MODE appends headroom metrics and, if ``self.bes`` is
        non-empty, updates the CPU controller with the result of
        ``detect_margin_exceed``.

        :param tasks_measurements: mapping of task id to its measurements
        :param tasks_labels: task labels, passed to ``_cid_to_app``
        :param metric_list: output list, extended in place
        :param timestamp: timestamp forwarded to containers and recorders
        :param assigned_cpus: forwarded to ``_get_headroom_metrics``
        """
        sysutil = 0
        lcutil = 0
        beutil = 0
        for cid, measurements in tasks_measurements.items():
            # Resolved once per task; reused for metric conversion and
            # for recording below.
            app = self._cid_to_app(cid, tasks_labels)
            if cid in self.container_map:
                container = self.container_map[cid]
            else:
                container = Container(cid)
                self.container_map[cid] = container
                if self.mode_config == ResourceAllocator.DETECT_MODE:
                    if cid in self.bes:
                        self.cpuc.set_share(cid, 0.0)
                        self.cpuc.budgeting([cid], [])
                        self.l3c.budgeting([cid], [])
                        if self.mbc_enabled:
                            self.mbc.budgeting([cid], [])
                    else:
                        self.cpuc.set_share(cid, 1.0)
                        if self.exclusive_cat:
                            self.l3c.budgeting([], [cid])

            container.update_measurement(timestamp, measurements, self.agg)
            if cid not in self.bes:
                lcutil += container.util
            else:
                beutil += container.util
            sysutil += container.util

            if not self.agg:
                continue

            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if not metrics:
                continue

            metric_list.extend(container.get_wca_metrics(app))
            # Only tasks that resolve to an app are recorded.
            if self.mode_config == ResourceAllocator.COLLECT_MODE and app:
                self._record_metrics(timestamp, cid, app, metrics)

        if self.mode_config == ResourceAllocator.COLLECT_MODE:
            self._record_utils(timestamp, lcutil)
        elif self.mode_config == ResourceAllocator.DETECT_MODE:
            metric_list.extend(
                self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
            if self.bes:
                exceed, hold = self.cpuc.detect_margin_exceed(lcutil, beutil)
                self.controllers[ContendedResource.CPUS].update(
                    self.bes, [], exceed, hold)
Beispiel #6
0
    def detect(self, platform: Platform, tasks_measurements: TasksMeasurements,
               tasks_resources: TasksResources, tasks_labels: TasksLabels):
        """Generate simulated contention anomalies based on wall-clock time.

        The current time (optionally shifted by a hostname-derived skew) is
        reduced modulo ``self.cycle_length`` to a "second of cycle", which
        selects a simulation phase. Each phase fixes how many predefined
        task pairs, how many contended resources and how many metrics per
        anomaly are produced; one anomaly is emitted per
        (resource, task pair) combination.

        At least 10 tasks must be present in ``tasks_measurements``;
        otherwise a warning is logged and no anomalies are produced.

        ``platform``, ``tasks_resources`` and ``tasks_labels`` are not used
        by this implementation.

        :return: tuple ``(anomalies, debugging_metrics)`` where
            ``debugging_metrics`` holds a single ``second_of_cycle`` gauge.
        """
        anomalies = []

        # Based on hostname generate skew of phase for different hosts,
        # to simulate contention alerting firing from multiple sources at different time.
        if self.skew:
            # Sum of the bytes of the SHA-256 digest of the hostname.
            phase_skew = sum(
                hashlib.sha256(socket.gethostname().encode('UTF-8')).digest())
        else:
            phase_skew = 0

        # Find out moment of cycle.
        second_of_cycle = int(time.time() + phase_skew) % self.cycle_length

        # Make sure we have enough tasks (to simulate contention);
        # the predefined pairs below index task ids 0..9.
        if len(tasks_measurements) >= 10:

            resources = [
                ContendedResource.CPUS,
                ContendedResource.LLC,
                ContendedResource.MEMORY_BW,
            ]

            # Define phases of simulation. Every phase yields
            # tasks_count * resources_count anomalies.
            if second_of_cycle < 10:
                # First task pair, one resource, one metric per anomaly.
                tasks_count = 1
                resources_count = 1
                metrics_count = 1
            elif second_of_cycle < 20:
                # First task pair, two resources, two metrics per anomaly.
                tasks_count = 1
                resources_count = 2
                metrics_count = 2
            elif second_of_cycle < 30:
                # First task pair, three resources, two metrics per anomaly.
                tasks_count = 1
                resources_count = 3
                metrics_count = 2
            elif second_of_cycle < 40:
                # First two task pairs, two resources, three metrics per anomaly.
                tasks_count = 2
                resources_count = 2
                metrics_count = 3
            elif second_of_cycle < 50:
                # All three task pairs, one resource, one metric per anomaly.
                tasks_count = 3
                resources_count = 1
                metrics_count = 1
            else:
                # Contention free period.
                resources_count = 0
                tasks_count = 0
                metrics_count = 0

            log.info('detector simulation: tasks=%d resources=%d metrics=%d!',
                     tasks_count, resources_count, metrics_count)

            # Sort so that the same tasks are picked for faked contention
            # on every call, independent of mapping order.
            task_ids = sorted(tasks_measurements.keys())

            # Predefined pairs of contended and contending tasks.
            task_pairs = [
                (task_ids[0], task_ids[1:3]),  # 0 vs 1,2
                (task_ids[3], task_ids[4:5]),  # 3 vs 4
                (task_ids[6], task_ids[7:10]),  # 6 vs 7,8,9
            ]

            # Generate multiple contention based on scenario phase.
            for resource_idx in range(resources_count):
                for task_pair_idx in range(tasks_count):

                    contended_task_id, contending_task_ids = task_pairs[
                        task_pair_idx]
                    resource = resources[resource_idx]
                    # Thresholds are 10, 20, 30, ... The product must be
                    # parenthesised: '%' and '*' have equal precedence, so
                    # '"%d" % (i + 1) * 10' would repeat the formatted
                    # string ten times instead of multiplying the number.
                    metrics = [
                        Metric(name="cpu_threshold_%d" % i,
                               value="%d" % ((i + 1) * 10),
                               type="gauge") for i in range(metrics_count)
                    ]

                    anomalies.append(
                        ContentionAnomaly(
                            contended_task_id=contended_task_id,
                            contending_task_ids=contending_task_ids,
                            resource=resource,
                            metrics=metrics,
                        ))
        else:
            log.warning('not enough tasks %d to simulate contention!',
                        len(tasks_measurements))

        debugging_metrics = [
            Metric(
                name='second_of_cycle',
                value=second_of_cycle,
                type="gauge",
            )
        ]

        return anomalies, debugging_metrics