def allocate(
        self,
        platform: Platform,
        tasks_data: TasksData,
) -> (TasksAllocations, List[Anomaly], List[Metric]):

    rules = []
    if self.rules:
        rules.extend(self.rules)

    if self.config:
        if not os.path.exists(self.config):
            log.warning('StaticAllocator: cannot find config file %r - ignoring!',
                        self.config)
        else:
            rules.extend(load_config(self.config))

    if len(rules) == 0:
        log.warning('StaticAllocator: no rules were provided!')
        return {}, [], []

    log.log(TRACE, 'StaticAllocator: handling allocations for %i tasks.', len(tasks_data))
    for task, data in tasks_data.items():
        log.log(TRACE, '%s', ' '.join(
            '%s=%s' % (k, v) for k, v in sorted(data.labels.items())))

    tasks_allocations = _build_allocations_from_rules(tasks_data, rules)

    log.debug('StaticAllocator: final tasks allocations: \n %s',
              pprint.pformat(tasks_allocations))

    return tasks_allocations, [], []


def _build_tasks_memory(tasks_data: TasksData, platform: Platform) -> TasksMemory:
    total_memory = _get_platform_total_memory(platform)

    tasks_memory = []
    for task, data in tasks_data.items():
        tasks_memory.append(
            (task,
             _get_task_memory_limit(data.measurements, total_memory, task, data.resources),
             _get_numa_node_preferences(data.measurements, platform.numa_nodes)))
    return sorted(tasks_memory, reverse=True, key=lambda x: x[1])


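# Illustrative only (an assumption, not taken from the module): assuming
# _get_task_memory_limit returns a size in bytes and _get_numa_node_preferences
# returns a {numa_node: preference} mapping, _build_tasks_memory yields a list
# sorted by the memory limit, largest first, e.g.:
#   [('task-a', 17179869184, {0: 0.8, 1: 0.2}),
#    ('task-b', 2147483648, {0: 0.5, 1: 0.5})]

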
def _build_tasks_metrics(tasks_data: TasksData) -> List[Metric]:
    """Build metrics for all tasks."""
    tasks_metrics: List[Metric] = []

    for task, data in tasks_data.items():
        task_metrics = export_metrics_from_measurements(data.measurements)

        # Decorate metrics with task specific labels.
        for task_metric in task_metrics:
            task_metric.labels.update(data.labels)

        tasks_metrics += task_metrics

    return tasks_metrics


def _update_tasks_data_with_allocations(tasks_data: TasksData,
                                        current_allocations: TasksAllocations):
    for task, data in tasks_data.items():
        if task in current_allocations:
            # No need to make a deep copy: the mapping is only one level deep
            # and its leaves are immutable types.
            data.allocations = dict(current_allocations[task])


def _build_allocations_from_rules(tasks_data: TasksData, rules):
    tasks_allocations = {}

    # Iterate over rules and apply one by one.
    for rule_idx, rule in enumerate(rules):
        if 'allocations' not in rule:
            log.warning('StaticAllocator(%s): missing "allocations" - ignore!', rule_idx)
            continue

        log.debug('StaticAllocator(%s): processing %s rule.', rule_idx,
                  '(%s)' % rule['name'] if 'name' in rule else '')

        new_task_allocations = rule['allocations']
        if not new_task_allocations:
            log.log(TRACE, 'StaticAllocator(%s): allocations are empty - ignore!', rule_idx)
            continue

        # Convert if necessary.
        if 'rdt' in new_task_allocations and isinstance(new_task_allocations['rdt'], dict):
            new_task_allocations[AllocationType.RDT] = RDTAllocation(
                **new_task_allocations['rdt'])

        # Prepare match_task_ids filter:
        if 'task_id' in rule:
            # Match by task_id.
            task_id = rule['task_id']
            match_task_ids = {task_id}
            log.log(TRACE, 'StaticAllocator(%s): match by task_id=%r',
                    rule_idx, rule['task_id'])
        elif 'labels' in rule:
            # Match by labels: find all tasks whose labels match the rule's regexps.
            labels = rule['labels']
            match_task_ids = set()
            for task, data in tasks_data.items():
                matching_label_names = set(data.labels) & set(labels)
                for label_name in matching_label_names:
                    if re.match(str(labels[label_name]), data.labels[label_name]):
                        match_task_ids.add(task)
                        log.log(TRACE, 'StaticAllocator(%s): match task %r by label=%s',
                                rule_idx, task, label_name)
        else:
            # Match everything.
            log.log(TRACE, 'StaticAllocator(%s): match all tasks', rule_idx)
            match_task_ids = tasks_data.keys()

        # For matching tasks, calculate and remember the target tasks_allocations.
        log.log(TRACE, 'StaticAllocator(%s): applying allocations for %i tasks',
                rule_idx, len(match_task_ids))

        rule_tasks_allocations = {}

        # Set rules for every matching task.
        for match_task_id in match_task_ids:
            rule_tasks_allocations[match_task_id] = new_task_allocations

        # Merge rules with previous rules.
        tasks_allocations = merge_rules(tasks_allocations, rule_tasks_allocations)

    return tasks_allocations


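# Illustrative only: a minimal sketch of a rules list accepted by
# _build_allocations_from_rules, based on the keys the function reads
# ('name', 'task_id', 'labels', 'allocations' and the nested 'rdt' dict).
# The task id, label regexp and allocation values below are made-up examples,
# not taken from any real configuration.
_EXAMPLE_RULES = [
    {
        # Match a single task directly by its id.
        'name': 'pin-one-task',
        'task_id': 'some_task_id',
        'allocations': {'cpu_quota': 0.5},
    },
    {
        # Match tasks whose 'app' label matches the regexp; the plain 'rdt'
        # dict is converted into an RDTAllocation object by the function above.
        'name': 'cache-for-matching-apps',
        'labels': {'app': 'redis-.*'},
        'allocations': {'rdt': {'l3': 'L3:0=ff;1=ff'}},
    },
    {
        # Neither 'task_id' nor 'labels' given: the rule applies to all tasks.
        'allocations': {'cpu_shares': 20},
    },
]

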
def detect(self, platform: Platform, tasks_data: TasksData):
    anomalies = []

    # Based on the hostname, generate a phase skew for different hosts,
    # to simulate contention alerts firing from multiple sources at different times.
    if self.skew:
        phase_skew = sum(hashlib.sha256(socket.gethostname().encode('UTF-8')).digest())
    else:
        phase_skew = 0

    # Find out the moment of the cycle.
    second_of_cycle = int(time.time() + phase_skew) % self.cycle_length

    # Make sure we have enough tasks (to simulate contention).
    if len(tasks_data) >= 10:

        resources = [
            ContendedResource.CPUS,
            ContendedResource.LLC,
            ContendedResource.MEMORY_BW,
        ]

        # Define phases of the simulation.
        if second_of_cycle < 10:
            # Single contention on one resource with a single contender task.
            tasks_count = 1
            resources_count = 1
            metrics_count = 1
        elif second_of_cycle < 20:
            # Single contention on two resources with a single contender task
            # (with two additional metrics).
            tasks_count = 1
            resources_count = 2
            metrics_count = 2
        elif second_of_cycle < 30:
            # Single contention on three resources with two contender tasks
            # (with two additional metrics each).
            tasks_count = 1
            resources_count = 3
            metrics_count = 2
        elif second_of_cycle < 40:
            # Two contentions, each on two resources with two contender tasks
            # (with two additional metrics each).
            tasks_count = 2
            resources_count = 2
            metrics_count = 3
        elif second_of_cycle < 50:
            # Multiple (three) contentions, each on a single resource with a single
            # contender task (with two additional metrics each).
            tasks_count = 3
            resources_count = 1
            metrics_count = 1
        else:
            # Contention-free period.
            resources_count = 0
            tasks_count = 0
            metrics_count = 0

        log.info('detector simulation: tasks=%d resources=%d metrics=%d!',
                 tasks_count, resources_count, metrics_count)

        # Make sure that we choose task pairs for generating faked contention.
        task_ids = sorted(tasks_data.keys())

        # Predefined pairs of contended and contending tasks.
        task_pairs = [
            (task_ids[0], task_ids[1:3]),   # 0 vs 1,2
            (task_ids[3], task_ids[4:5]),   # 3 vs 4
            (task_ids[6], task_ids[7:10]),  # 6 vs 7,8,9
        ]

        # Generate multiple contentions based on the scenario phase.
        for resource_idx in range(resources_count):
            for task_pair_idx in range(tasks_count):
                contended_task_id, contending_task_ids = task_pairs[task_pair_idx]
                resource = resources[resource_idx]

                metrics = [
                    Metric(name="cpu_threshold_%d" % i, value=(i + 1) * 10, type="gauge")
                    for i in range(metrics_count)
                ]

                anomalies.append(
                    ContentionAnomaly(
                        contended_task_id=contended_task_id,
                        contending_task_ids=contending_task_ids,
                        resource=resource,
                        metrics=metrics,
                    ))
    else:
        log.warning('not enough tasks %d to simulate contention!', len(tasks_data))

    debugging_metrics = [
        Metric(
            name='second_of_cycle',
            value=second_of_cycle,
            type="gauge",
        )
    ]

    return anomalies, debugging_metrics