Example #1
    def allocate(
        self,
        platform: Platform,
        tasks_data: TasksData,
    ) -> Tuple[TasksAllocations, List[Anomaly], List[Metric]]:

        rules = []
        if self.rules:
            rules.extend(self.rules)

        if self.config:
            if not os.path.exists(self.config):
                log.warning(
                    'StaticAllocator: cannot find config file %r - ignoring!',
                    self.config)
            else:
                rules.extend(load_config(self.config))

        if len(rules) == 0:
            log.warning('StaticAllocator: no rules were provided!')
            return {}, [], []

        log.log(TRACE, 'StaticAllocator: handling allocations for %i tasks.',
                len(tasks_data))
        for task, data in tasks_data.items():
            log.log(
                TRACE, '%s', ' '.join('%s=%s' % (k, v)
                                      for k, v in sorted(data.labels.items())))

        tasks_allocations = _build_allocations_from_rules(tasks_data, rules)

        log.debug('StaticAllocator: final tasks allocations: \n %s',
                  pprint.pformat(tasks_allocations))
        return tasks_allocations, [], []
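
For reference, the rules consumed above (set directly on the allocator or loaded via load_config) are plain dictionaries. A minimal sketch of their structure, inferred from the keys read in _build_allocations_from_rules (Example #5); the concrete allocation names and values are illustrative assumptions, not a confirmed schema:

example_rules = [
    {
        # Match a single task directly by its id.
        'name': 'pin-important-task',       # optional, used only in log output
        'task_id': 'task-123',
        'allocations': {'cpu_quota': 0.5},  # allocation name/value assumed
    },
    {
        # Match tasks whose label values satisfy a regular expression.
        'name': 'throttle-batch-jobs',
        'labels': {'workload_type': 'batch.*'},
        'allocations': {'rdt': {'name': 'batch'}},  # dict converted to RDTAllocation
    },
    {
        # Neither 'task_id' nor 'labels': the rule applies to every task.
        'allocations': {'cpu_quota': 1.0},
    },
]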
Example #2
def _build_tasks_memory(tasks_data: TasksData,
                        platform: Platform) -> TasksMemory:
    total_memory = _get_platform_total_memory(platform)

    tasks_memory = []
    for task, data in tasks_data.items():
        tasks_memory.append(
            (task,
             _get_task_memory_limit(data.measurements, total_memory, task,
                                    data.resources),
             _get_numa_node_preferences(data.measurements,
                                        platform.numa_nodes)))
    return sorted(tasks_memory, reverse=True, key=lambda x: x[1])
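
The function returns (task, memory_limit, numa_preferences) tuples ordered by memory limit, largest first. A self-contained illustration of that ordering, with made-up values standing in for the helper results:

tasks_memory = [
    ('task-a', 2048, {0: 0.9, 1: 0.1}),
    ('task-b', 8192, {0: 0.2, 1: 0.8}),
    ('task-c', 512, {0: 0.5, 1: 0.5}),
]
# key=lambda x: x[1] sorts on the memory limit; reverse=True puts the
# biggest consumer first.
tasks_memory = sorted(tasks_memory, reverse=True, key=lambda x: x[1])
assert [t[0] for t in tasks_memory] == ['task-b', 'task-a', 'task-c']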
def _build_tasks_metrics(tasks_data: TasksData) -> List[Metric]:
    """Build metrics for all tasks."""
    tasks_metrics: List[Metric] = []

    for task, data in tasks_data.items():
        task_metrics = export_metrics_from_measurements(data.measurements)

        # Decorate metrics with task specific labels.
        for task_metric in task_metrics:
            task_metric.labels.update(data.labels)

        tasks_metrics += task_metrics

    return tasks_metrics
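
The decoration step mutates each metric's labels in place, so every exported measurement carries its task's identity. A self-contained sketch using a simplified stand-in for wca's Metric class (the real one lives in wca.metrics):

from dataclasses import dataclass, field
from typing import Dict


@dataclass
class FakeMetric:  # simplified stand-in, not the real wca Metric
    name: str
    value: float
    labels: Dict[str, str] = field(default_factory=dict)


task_labels = {'task_id': 'task-123', 'app': 'redis'}  # made-up labels
metrics = [FakeMetric('cpu_usage', 0.7), FakeMetric('mem_usage', 0.4)]
for metric in metrics:
    metric.labels.update(task_labels)  # same in-place update as above

assert all(m.labels['app'] == 'redis' for m in metrics)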
def _update_tasks_data_with_allocations(tasks_data: TasksData,
                                        current_allocations: TasksAllocations):
    for task, data in tasks_data.items():
        if task in current_allocations:
            # No need to make a deep copy: the mapping is one level deep with immutable types as leaves.
            data.allocations = dict(current_allocations[task])
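
Why the shallow dict(...) copy above is enough: the allocations mapping is one level deep with immutable leaf values, so copying the top level already decouples the task's view from the shared current_allocations entry. A quick demonstration with made-up allocation values:

current = {'cpu_quota': 0.5, 'cpu_shares': 1024}
task_view = dict(current)
task_view['cpu_quota'] = 1.0
assert current['cpu_quota'] == 0.5  # the shared entry is untouched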
Example #5
def _build_allocations_from_rules(tasks_data: TasksData, rules):
    tasks_allocations = {}

    # Iterate over rules and apply one by one.
    for rule_idx, rule in enumerate(rules):
        if 'allocations' not in rule:
            log.warning('StaticAllocator(%s): missing "allocations" - ignore!',
                        rule_idx)
            continue

        log.debug('StaticAllocator(%s): processing %s rule.', rule_idx,
                  '(%s)' % rule['name'] if 'name' in rule else '')

        new_task_allocations = rule['allocations']
        if not new_task_allocations:
            log.log(TRACE,
                    'StaticAllocator(%s): allocations are empty - ignore!',
                    rule_idx)
            continue

        # Convert the 'rdt' dict to an RDTAllocation if necessary.
        if 'rdt' in new_task_allocations and isinstance(
                new_task_allocations['rdt'], dict):
            new_task_allocations[AllocationType.RDT] = RDTAllocation(
                **new_task_allocations['rdt'])

        # Prepare match_task_ids filter:
        if 'task_id' in rule:
            # by task_id
            task_id = rule['task_id']
            match_task_ids = {task_id}
            log.log(TRACE, 'StaticAllocator(%s): match by task_id=%r',
                    rule_idx, rule['task_id'])

        # Find all tasks that match.
        elif 'labels' in rule:
            labels = rule['labels']
            # by labels
            match_task_ids = set()
            for task, data in tasks_data.items():
                matching_label_names = set(data.labels) & set(labels)
                for label_name in matching_label_names:
                    if re.match(str(labels[label_name]),
                                data.labels[label_name]):
                        match_task_ids.add(task)
                        log.log(
                            TRACE,
                            'StaticAllocator(%s):  match task %r by label=%s',
                            rule_idx, task, label_name)
        else:
            # match everything
            log.log(TRACE, 'StaticAllocator(%s):  match all tasks', rule_idx)
            match_task_ids = tasks_data.keys()

        # for matching tasks calculate and remember target_tasks_allocations
        log.log(TRACE,
                'StaticAllocator(%s):  applying allocations for %i tasks',
                rule_idx, len(match_task_ids))

        rule_tasks_allocations = {}

        # Set rules for every matching task.
        for match_task_id in match_task_ids:
            rule_tasks_allocations[match_task_id] = new_task_allocations

        # Merge rules with previous rules.
        tasks_allocations = merge_rules(tasks_allocations,
                                        rule_tasks_allocations)

    return tasks_allocations
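
The label branch anchors a regular expression at the start of each label value (re.match semantics). A self-contained sketch of just that matching step, with stand-in task data carrying only the .labels field the function actually reads:

import re
from types import SimpleNamespace

# Made-up tasks_data: task id -> object with a .labels dict.
tasks_data = {
    'task-1': SimpleNamespace(labels={'workload_type': 'batch-encoder'}),
    'task-2': SimpleNamespace(labels={'workload_type': 'latency-critical'}),
}
rule_labels = {'workload_type': 'batch.*'}

match_task_ids = set()
for task, data in tasks_data.items():
    for label_name in set(data.labels) & set(rule_labels):
        # re.match anchors at the beginning of the value, so 'batch.*'
        # matches 'batch-encoder' but not 'latency-critical'.
        if re.match(str(rule_labels[label_name]), data.labels[label_name]):
            match_task_ids.add(task)

assert match_task_ids == {'task-1'}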
Example #6
    def detect(self, platform: Platform, tasks_data: TasksData):

        anomalies = []

        # Derive a phase skew from the hostname so that different hosts
        # simulate contention alerts firing at different times.
        if self.skew:
            phase_skew = sum(
                hashlib.sha256(socket.gethostname().encode('UTF-8')).digest())
        else:
            phase_skew = 0

        # Find out moment of cycle.
        second_of_cycle = int(time.time() + phase_skew) % self.cycle_length

        # Make sure we have enough tasks (to simulate contention).
        if len(tasks_data) >= 10:

            resources = [
                ContendedResource.CPUS,
                ContendedResource.LLC,
                ContendedResource.MEMORY_BW,
            ]

            # Define phases of simulation.
            if second_of_cycle < 10:
                # Single contention on one resource with single contender task.
                tasks_count = 1
                resources_count = 1
                metrics_count = 1
            elif second_of_cycle < 20:
                # Single contention on two resources with single contender task
                # (with two additional metrics).
                tasks_count = 1
                resources_count = 2
                metrics_count = 2
            elif second_of_cycle < 30:
                # Single contention on three resources with two contender tasks
                # (with two additional metrics each).
                tasks_count = 1
                resources_count = 3
                metrics_count = 2
            elif second_of_cycle < 40:
                # Two contentions each on two resources with two contender tasks
                # (with two additional metrics each).
                tasks_count = 2
                resources_count = 2
                metrics_count = 3
            elif second_of_cycle < 50:
                # Multiple (three) contentions each on single resource with single contender task
                # (with two additional metrics each).
                tasks_count = 3
                resources_count = 1
                metrics_count = 1
            else:
                # Contention free period.
                resources_count = 0
                tasks_count = 0
                metrics_count = 0

            log.info('detector simulation: tasks=%d resources=%d metrics=%d!',
                     tasks_count, resources_count, metrics_count)

            # Deterministically choose task pairs for generating fake contention.
            task_ids = sorted(tasks_data.keys())

            # Predefined pairs of contended and contending tasks.
            task_pairs = [
                (task_ids[0], task_ids[1:3]),  # 0 vs 1,2
                (task_ids[3], task_ids[4:5]),  # 3 vs 4
                (task_ids[6], task_ids[7:10]),  # 6 vs 7,8,9
            ]

            # Generate multiple contention based on scenario phase.
            for resource_idx in range(resources_count):
                for task_pair_idx in range(tasks_count):

                    contended_task_id, contending_task_ids = task_pairs[
                        task_pair_idx]
                    resource = resources[resource_idx]
                    metrics = [
                        Metric(name="cpu_threshold_%d" % i,
                               value="%d" % (i + 1) * 10,
                               type="gauge") for i in range(metrics_count)
                    ]

                    anomalies.append(
                        ContentionAnomaly(
                            contended_task_id=contended_task_id,
                            contending_task_ids=contending_task_ids,
                            resource=resource,
                            metrics=metrics,
                        ))
        else:
            log.warning('not enough tasks (%d) to simulate contention!',
                        len(tasks_data))

        debugging_metrics = [
            Metric(
                name='second_of_cycle',
                value=second_of_cycle,
                type="gauge",
            )
        ]

        return anomalies, debugging_metrics
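
How the skew keeps simulated alerts from different hosts out of phase: summing the SHA-256 digest bytes of the hostname yields a stable per-host offset (0 to 8160), shifting where each host sits within the cycle. A sketch, assuming a 60-second cycle (the real length comes from self.cycle_length):

import hashlib
import socket
import time

phase_skew = sum(hashlib.sha256(socket.gethostname().encode('UTF-8')).digest())
cycle_length = 60  # assumed value; configured via self.cycle_length above
second_of_cycle = int(time.time() + phase_skew) % cycle_length
print('skew=%d second_of_cycle=%d' % (phase_skew, second_of_cycle))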