def _process_measurements(self, tasks_measurements: TasksMeasurements,
                          tasks_labels: TasksLabels,
                          metric_list: List[wcaMetric],
                          timestamp: float, assigned_cpus: float):
    """Update per-container measurements and emit utilization metrics.

    For every task, pushes its raw measurements into the matching container,
    accumulates latency-critical (lc) and system-wide CPU utilization, and —
    when an aggregation window is complete (``self.agg``) — converts the
    container metrics into WCA metrics appended to ``metric_list``.

    Args:
        tasks_measurements: raw measurements keyed by task/container id.
        tasks_labels: labels keyed by task id; used to resolve the app name
            and to classify best-effort (BE) tasks.
        metric_list: output list; WCA metrics are appended in place.
        timestamp: measurement timestamp passed through to containers.
        assigned_cpus: total CPUs assigned, used for headroom metrics in
            detect mode.
    """
    sysutil = 0
    lcutil = 0
    for cid, measurements in tasks_measurements.items():
        app = self._cid_to_app(cid, tasks_labels)
        container = self._get_container_from_taskid(cid)
        container.update_measurement(timestamp, measurements, self.agg)
        # BE (best-effort) tasks are excluded from the latency-critical sum
        # but still count towards total system utilization.
        if not self._is_be_app(cid, tasks_labels):
            lcutil += container.util
        sysutil += container.util
        if self.agg:
            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if metrics:
                wca_metrics = container.get_wca_metrics(app)
                metric_list.extend(wca_metrics)
                if self.mode_config == ContentionDetector.COLLECT_MODE:
                    # Fix: 'app' was already resolved at the top of the loop;
                    # the original re-resolved it here redundantly.
                    if app:
                        self._record_metrics(timestamp, cid, app, metrics)
    if self.mode_config == ContentionDetector.COLLECT_MODE:
        self._record_utils(timestamp, lcutil)
    elif self.mode_config == ContentionDetector.DETECT_MODE:
        metric_list.extend(
            self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
def _process_measurements(self, tasks_measurements: TasksMeasurements,
                          tasks_labels: TasksLabels,
                          metric_list: List[WCAMetric],
                          timestamp: float, assigned_cpus: float,
                          cpu_model: str):
    """Update per-container measurements and record per-app metrics.

    For every task, pushes its raw measurements into the matching container
    and accumulates latency-critical (lc) and system-wide CPU utilization.
    When an aggregation window is complete (``self.agg``), converts container
    metrics into WCA metrics and records them to the metric data file.
    Always appends headroom metrics at the end.

    Args:
        tasks_measurements: raw measurements keyed by task/container id.
        tasks_labels: labels keyed by task id; used to resolve the app name
            and to classify best-effort (BE) tasks.
        metric_list: output list; WCA and headroom metrics appended in place.
        timestamp: measurement timestamp passed through to containers.
        assigned_cpus: total CPUs assigned, used for headroom metrics.
        cpu_model: CPU model string recorded alongside metrics (sanitized
            via correct_key_characters).
    """
    sysutil = 0
    lcutil = 0
    for cid, measurements in tasks_measurements.items():
        app = self._cid_to_app(cid, tasks_labels)
        container = self._get_container_from_taskid(cid)
        container.update_measurement(timestamp, measurements, self.agg)
        if not self._is_be_app(cid, tasks_labels):
            lcutil += container.util
        sysutil += container.util
        if self.agg:
            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            # Bug fix: guard on 'app' BEFORE indexing workload_meta.
            # Previously 'vcpus = self.workload_meta[app][...]' ran before
            # the 'if app:' check, raising KeyError for tasks with no
            # resolvable app. This mirrors the later (fixed) variant that
            # uses 'if metrics and app:'.
            if metrics and app:
                vcpus = self.workload_meta[app]['cpus']
                wca_metrics = container.get_wca_metrics(app, vcpus)
                metric_list.extend(wca_metrics)
                # always try to init header column considering log rotate
                self._init_data_file(self.metric_file, self.mcols)
                self._record_metrics(timestamp, cid, app,
                                     correct_key_characters(cpu_model),
                                     vcpus, metrics)
    metric_list.extend(
        self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
def _build_tasks_metrics(tasks_labels: TasksLabels,
                         tasks_measurements: TasksMeasurements) -> List[Metric]:
    """Convert raw per-task measurements into Metric objects.

    Each metric produced from a task's measurements is tagged with that
    task's labels before being collected into a single flat list.
    """
    result: List[Metric] = []
    for tid, measurements in tasks_measurements.items():
        labels = tasks_labels[tid]
        metrics_for_task = create_metrics(measurements)
        # Attach the task-specific labels to every metric of this task.
        for metric in metrics_for_task:
            metric.labels.update(labels)
        result.extend(metrics_for_task)
    return result
def _process_measurements(
        self, tasks_measurements: TasksMeasurements,
        tasks_labels: TasksLabels,
        metric_list: List[WCAMetric],
        timestamp: float,
        assigned_cpus: float,
        cpu_model: str
):
    """Update per-container measurements, apply resource controls, and emit metrics.

    For every task: resolves its app, lazily creates/caches its Container,
    optionally applies CPU/L3/memory-bandwidth budgeting (when
    ``self.enable_control``), accumulates latency-critical (lc),
    best-effort (be) and total utilization, and — when an aggregation
    window is complete (``self.agg``) — records WCA metrics and writes
    them to the metric data file. Finally appends headroom metrics and,
    if controlling and BE tasks exist, updates the CPU contention
    controller.

    Args:
        tasks_measurements: raw measurements keyed by task/container id.
        tasks_labels: labels keyed by task id; used to resolve app names.
        metric_list: output list; metrics are appended in place.
        timestamp: measurement timestamp passed through to containers.
        assigned_cpus: total CPUs assigned; input to headroom metrics.
        cpu_model: CPU model string recorded with metrics.
    """
    sysutil = 0
    lcutil = 0
    beutil = 0
    for cid, measurements in tasks_measurements.items():
        app = self._cid_to_app(cid, tasks_labels)
        # Lazily create and cache a Container per task id.
        if cid in self.container_map:
            container = self.container_map[cid]
        else:
            container = Container(cid)
            self.container_map[cid] = container
        if self.enable_control:
            if cid in self.bes:
                # Best-effort task: demote its CPU share and put it under
                # CPU / L3 (and optionally memory-bandwidth) budgeting.
                # NOTE(review): budgeting([cid], []) vs ([], [cid]) argument
                # order presumably distinguishes throttled vs exclusive sets
                # — confirm against the controller implementations.
                self.cpuc.set_share(cid, 0.0)
                self.cpuc.budgeting([cid], [])
                self.l3c.budgeting([cid], [])
                if self.mbc_enabled:
                    self.mbc.budgeting([cid], [])
            else:
                # Latency-critical task: full CPU share; optionally an
                # exclusive L3 cache allocation.
                self.cpuc.set_share(cid, 1.0)
                if self.exclusive_cat:
                    self.l3c.budgeting([], [cid])
        container.update_measurement(timestamp, measurements, self.agg)
        # BE utilization is tracked separately from latency-critical.
        if cid not in self.bes:
            lcutil += container.util
        else:
            beutil += container.util
        sysutil += container.util
        if self.agg:
            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if metrics and app:
                vcpus = self.workload_meta[app]['cpus']
                wca_metrics = container.get_wca_metrics(app, vcpus)
                metric_list.extend(wca_metrics)
                # always try to init header column considering log rotate
                self._init_data_file(self.metric_file, self.mcols)
                self._record_metrics(timestamp, cid, app,
                                     correct_key_characters(cpu_model),
                                     vcpus, metrics)
    metric_list.extend(self._get_headroom_metrics(assigned_cpus, lcutil, sysutil))
    # With control enabled, feed lc/be utilization into the CPU contention
    # controller so it can grant or revoke BE task budget.
    if self.enable_control and self.bes:
        exceed, hold = self.cpuc.detect_margin_exceed(lcutil, beutil)
        self.controllers[ContendedResource.CPUS].update(self.bes, [], exceed, hold)
def _process_measurements(self, tasks_measurements: TasksMeasurements,
                          tasks_labels: TasksLabels,
                          metric_list: List[WCAMetric],
                          timestamp: float, assigned_cpus: float):
    """Update per-container measurements, apply budgeting, and emit metrics.

    Walks all tasks: resolves each task's app, lazily creates/caches its
    Container, applies CPU/L3/memory-bandwidth budgeting in detect mode,
    and accumulates latency-critical, best-effort and total utilization.
    In collect mode it records metrics and utilization to files; in detect
    mode it appends headroom metrics and updates the CPU contention
    controller when best-effort tasks are present.
    """
    total_util = 0
    lc_util = 0
    be_util = 0
    for cid, measurements in tasks_measurements.items():
        app = self._cid_to_app(cid, tasks_labels)
        # Lazily create and cache one Container per task id.
        container = self.container_map.get(cid)
        if container is None:
            container = Container(cid)
            self.container_map[cid] = container
        if self.mode_config == ResourceAllocator.DETECT_MODE:
            is_be = cid in self.bes
            if is_be:
                # Best-effort task: zero CPU share, budget CPU and L3
                # (and memory bandwidth when enabled).
                self.cpuc.set_share(cid, 0.0)
                self.cpuc.budgeting([cid], [])
                self.l3c.budgeting([cid], [])
                if self.mbc_enabled:
                    self.mbc.budgeting([cid], [])
            else:
                # Latency-critical: full share, optionally exclusive L3.
                self.cpuc.set_share(cid, 1.0)
                if self.exclusive_cat:
                    self.l3c.budgeting([], [cid])
        container.update_measurement(timestamp, measurements, self.agg)
        if cid in self.bes:
            be_util += container.util
        else:
            lc_util += container.util
        total_util += container.util
        if self.agg:
            metrics = container.get_metrics()
            log.debug('cid=%r container metrics=%r', cid, metrics)
            if metrics:
                metric_list.extend(container.get_wca_metrics(app))
                if self.mode_config == ResourceAllocator.COLLECT_MODE:
                    # NOTE(review): 'app' is re-resolved here even though it
                    # was computed above; kept to preserve original behavior.
                    app = self._cid_to_app(cid, tasks_labels)
                    if app:
                        self._record_metrics(timestamp, cid, app, metrics)
    if self.mode_config == ResourceAllocator.COLLECT_MODE:
        self._record_utils(timestamp, lc_util)
    elif self.mode_config == ResourceAllocator.DETECT_MODE:
        metric_list.extend(
            self._get_headroom_metrics(assigned_cpus, lc_util, total_util))
    if self.bes:
        exceed, hold = self.cpuc.detect_margin_exceed(lc_util, be_util)
        self.controllers[ContendedResource.CPUS].update(
            self.bes, [], exceed, hold)
def detect(self, platform: Platform,
           tasks_measurements: TasksMeasurements,
           tasks_resources: TasksResources,
           tasks_labels: TasksLabels):
    """Simulate contention anomalies in a repeating time-based cycle.

    Produces fake ContentionAnomaly objects whose count, contended
    resources and attached metrics vary by phase of a fixed-length cycle,
    so alerting pipelines can be exercised end to end. Requires at least
    10 tasks; otherwise only logs a warning and returns no anomalies.

    Args:
        platform: unused here; part of the detector interface.
        tasks_measurements: used for task ids and the task count.
        tasks_resources: unused here; part of the detector interface.
        tasks_labels: unused here; part of the detector interface.

    Returns:
        Tuple of (anomalies, debugging_metrics) where debugging_metrics
        always contains the current second_of_cycle gauge.
    """
    anomalies = []
    # Based on hostname generate skew of phase for different hosts,
    # to simulate contention alerting firing from multiple sources at different time.
    if self.skew:
        phase_skew = sum(
            hashlib.sha256(socket.gethostname().encode('UTF-8')).digest())
    else:
        phase_skew = 0

    # Find out moment of cycle.
    second_of_cycle = int(time.time() + phase_skew) % self.cycle_length

    # Make sure we have enough tasks (to simulate contention).
    if len(tasks_measurements) >= 10:
        resources = [
            ContendedResource.CPUS,
            ContendedResource.LLC,
            ContendedResource.MEMORY_BW,
        ]
        # Define phases of simulation.
        if second_of_cycle < 10:
            # Single contention on one resource with single contender task.
            tasks_count = 1
            resources_count = 1
            metrics_count = 1
        elif second_of_cycle < 20:
            # Single contention on two resources with single contender task
            # (with two additional metrics).
            tasks_count = 1
            resources_count = 2
            metrics_count = 2
        elif second_of_cycle < 30:
            # Single contention on three resources with single contender task
            # (with two additional metrics each).
            tasks_count = 1
            resources_count = 3
            metrics_count = 2
        elif second_of_cycle < 40:
            # Two contentions each on two resources with two contender tasks
            # (with three additional metrics each).
            tasks_count = 2
            resources_count = 2
            metrics_count = 3
        elif second_of_cycle < 50:
            # Multiple (three) contentions each on single resource with single contender task
            # (with one additional metric each).
            tasks_count = 3
            resources_count = 1
            metrics_count = 1
        else:
            # Contention free period.
            resources_count = 0
            tasks_count = 0
            metrics_count = 0

        log.info('detector simulation: tasks=%d resources=%d metrics=%d!',
                 tasks_count, resources_count, metrics_count)

        # Make sure that we choose tasks pairs for generating faked contention.
        task_ids = sorted(tasks_measurements.keys())
        # Predefined pairs of contended and contending tasks.
        task_pairs = [
            (task_ids[0], task_ids[1:3]),   # 0 vs 1,2
            (task_ids[3], task_ids[4:5]),   # 3 vs 4
            (task_ids[6], task_ids[7:10]),  # 6 vs 7,8,9
        ]

        # Generate multiple contention based on scenario phase.
        for resource_idx in range(resources_count):
            for task_pair_idx in range(tasks_count):
                contended_task_id, contending_task_ids = task_pairs[task_pair_idx]
                resource = resources[resource_idx]
                # Bug fix: the original 'value="%d" % (i + 1) * 10' parsed as
                # ('%d' % (i + 1)) * 10 due to operator precedence, yielding a
                # digit string repeated ten times (e.g. "1111111111") instead
                # of the numeric gauge value 10, 20, 30, ...
                metrics = [
                    Metric(name="cpu_threshold_%d" % i,
                           value=(i + 1) * 10,
                           type="gauge")
                    for i in range(metrics_count)
                ]
                anomalies.append(
                    ContentionAnomaly(
                        contended_task_id=contended_task_id,
                        contending_task_ids=contending_task_ids,
                        resource=resource,
                        metrics=metrics,
                    ))
    else:
        log.warning('not enough tasks %d to simulate contention!',
                    len(tasks_measurements))

    debugging_metrics = [
        Metric(
            name='second_of_cycle',
            value=second_of_cycle,
            type="gauge",
        )
    ]

    return anomalies, debugging_metrics