Ejemplo n.º 1
0
    def get_measurements(self, mongroup_name, mb_monitoring_enabled,
                         cache_monitoring_enabled) -> Measurements:
        """
        Sum the enabled monitoring events of a monitoring group over every
        entry found in its MON_DATA directory.

        mbm_total: Memory bandwidth - type: counter, unit: [bytes]
        :param mongroup_name: name of the monitoring group directory
        :param mb_monitoring_enabled: when true, read and report MBM_TOTAL
        :param cache_monitoring_enabled: when true, read and report LLC_OCCUPANCY
        :return: Dictionary containing memory bandwidth and/or LLC occupancy,
        depending on which of the two flags are enabled
        :raises MissingMeasurementException: when a directory or event file
        is missing
        """
        mon_data_path = os.path.join(self.fullpath, MON_GROUPS,
                                     mongroup_name, MON_DATA)

        def _read_event(socket_dir, event_name):
            # Each event file holds a single integer value.
            with open(os.path.join(mon_data_path, socket_dir,
                                   event_name)) as event_file:
                return int(event_file.read())

        bandwidth_sum = 0
        occupancy_sum = 0

        # Walk over the per-socket directories and accumulate the values.
        try:
            for socket_dir in os.listdir(mon_data_path):
                if mb_monitoring_enabled:
                    bandwidth_sum += _read_event(socket_dir, MBM_TOTAL)
                if cache_monitoring_enabled:
                    occupancy_sum += _read_event(socket_dir, LLC_OCCUPANCY)
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Measurement unavailable.'.format(
                    e.filename))

        # Only enabled metrics are reported.
        result = {}
        if mb_monitoring_enabled:
            result[MetricName.MEM_BW] = bandwidth_sum
        if cache_monitoring_enabled:
            result[MetricName.LLC_OCCUPANCY] = occupancy_sum
        return result
Ejemplo n.º 2
0
    def reset_counters(self):
        """Reset counters managed by the cgroup abstraction.

        Writes '0' to the CPU usage file of this cgroup.

        Rationale (as given by the original author): after one of the
        containers of a Pod restarts, the cgroup is reused, but cpuacct.usage
        still holds a value from previous unsuccessful runs. For
        multi-container Pods the long-lived sum of counters from all
        containers may then slightly decrease. Prometheus treats any decrease
        of a counter as a reset and assumes the "total value" is the last
        period's decrease, which causes unrealistic spikes in counter
        rate/increase.

        There are two solutions:
        1. Make sure that after cgroups are reinitialized, all counters are
           reset (this solution).
        2. Do not aggregate counters (by summing), but expose them as
           additional time series (new metrics per container). To be
           considered, but requires an API change (new levels) for some
           metrics.

        PS. The cgroup-only solution works because all other metrics are
        gauges (NUMA), collected per Pod (RDT) or properly reset to 0
        (perf counters).

        :raises MissingMeasurementException: when the CPU usage file is missing
        """
        cpu_usage_path = os.path.join(self.cgroup_cpu_fullpath,
                                      CgroupResource.CPU_USAGE)
        try:
            with open(cpu_usage_path, 'w') as cpu_usage_file:
                cpu_usage_file.write('0')
        except FileNotFoundError as e:
            message = ('File {} is missing. '
                       'Cpu usage (to perfom reset) unavailable.').format(
                           e.filename)
            raise MissingMeasurementException(message)
Ejemplo n.º 3
0
def _get_cgroup_fd(cgroup) -> int:
    """
    Open the perf_event cgroup directory of ``cgroup`` and return its FD.

    :param cgroup: cgroup path joined onto '/sys/fs/cgroup/perf_event'
    :return: read-only file descriptor obtained from os.open
    :raises MissingMeasurementException: when the directory does not exist
    """
    perf_event_dir = os.path.join('/sys/fs/cgroup/perf_event', cgroup)
    try:
        # The cgroup is a directory, so the raw descriptor from os.open is
        # returned as-is; it cannot be wrapped with fdopen.
        cgroup_fd = os.open(perf_event_dir, os.O_RDONLY)
    except FileNotFoundError:
        raise MissingMeasurementException(
            'cannot initialize perf for cgroup %r - directory not found' % cgroup)
    return cgroup_fd
Ejemplo n.º 4
0
    def get_measurements(self) -> Measurements:
        """Read CPU usage and memory usage of this cgroup.

        Each value is read as a single integer from its cgroup file.

        :return: Dictionary with MetricName.CPU_USAGE_PER_TASK and
        MetricName.MEM_USAGE_PER_TASK
        :raises MissingMeasurementException: when either file is missing
        """
        cpu_usage_path = os.path.join(self.cgroup_cpu_fullpath,
                                      CgroupResource.CPU_USAGE)
        try:
            with open(cpu_usage_path) as cpu_usage_file:
                cpu_usage = int(cpu_usage_file.read())
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Cpu usage unavailable.'.format(e.filename))

        memory_usage_path = os.path.join(self.cgroup_memory_fullpath,
                                         CgroupResource.MEMORY_USAGE)
        try:
            with open(memory_usage_path) as memory_usage_file:
                memory_usage = int(memory_usage_file.read())
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Memory usage unavailable.'.format(e.filename))

        return {
            MetricName.CPU_USAGE_PER_TASK: cpu_usage,
            MetricName.MEM_USAGE_PER_TASK: memory_usage,
        }
Ejemplo n.º 5
0
    def get_measurements(self, mongroup_name, mb_monitoring_enabled,
                         cache_monitoring_enabled) -> Measurements:
        """
        Sum the enabled monitoring events of a monitoring group over all
        socket directories returned by ``get_socket_dirs``.

        mbm_total: Memory bandwidth - type: counter, unit: [bytes]

        mbm_local: Local memory bandwidth - type: counter, unit: [bytes]

        mbm_remote: Remote memory bandwidth - type: counter, unit: [bytes]
        (not read from a file; reported as mbm_total - mbm_local)

        :param mongroup_name: name of the monitoring group
        :param mb_monitoring_enabled: when true, report the bandwidth metrics
        :param cache_monitoring_enabled: when true, report LLC occupancy
        :return: Dictionary containing memory bandwidth and/or LLC occupancy,
        depending on which of the two flags are enabled
        :raises MissingMeasurementException: when an event file is missing
        """

        def _read_counter(socket_dir, event_name):
            # Each event file holds a single integer value.
            event_path = self.get_event_file(socket_dir, event_name,
                                             mongroup_name)
            with open(event_path) as event_file:
                return int(event_file.read())

        total_bytes = 0
        local_bytes = 0
        occupancy_bytes = 0

        # Accumulate the values socket by socket.
        try:
            for socket_dir in self.get_socket_dirs(mongroup_name):
                if mb_monitoring_enabled:
                    total_bytes += _read_counter(socket_dir, MBM_TOTAL)
                    local_bytes += _read_counter(socket_dir, MBM_LOCAL)

                if cache_monitoring_enabled:
                    occupancy_bytes += _read_counter(socket_dir, LLC_OCCUPANCY)
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Measurement unavailable.'.format(
                    e.filename))

        result = {}
        if mb_monitoring_enabled:
            result.update({
                MetricName.TASK_MEM_BANDWIDTH_BYTES: total_bytes,
                MetricName.TASK_MEM_BANDWIDTH_LOCAL_BYTES: local_bytes,
                # Remote bandwidth is derived, not measured directly.
                MetricName.TASK_MEM_BANDWIDTH_REMOTE_BYTES:
                    total_bytes - local_bytes,
            })
        if cache_monitoring_enabled:
            result[MetricName.TASK_LLC_OCCUPANCY_BYTES] = occupancy_bytes
        return result
        containers)

    assert tasks_measurements == {'t1_task_id': {'cpu_usage': 13}}
    assert tasks_resources == {'t1_task_id': {'cpu': 3}}
    assert tasks_labels == {
        't1_task_id': {
            'initial_task_cpu_assignment': 'unknown',
            'label_key': 'label_value',
            'task_id': 't1_task_id'
        }
    }


@patch('wca.cgroups.Cgroup')
@patch('wca.resctrl.ResGroup.get_measurements',
       side_effect=MissingMeasurementException())
@patch('wca.perf.PerfCounters')
def test_prepare_task_data_resgroup_not_found(*mocks):
    """Expect no task measurements when ResGroup.get_measurements raises
    MissingMeasurementException."""
    rdt_information = RDTInformation(True, True, True, True, '0', '0', 0, 0, 0)
    t1_task = task('/t1',
                   labels={'label_key': 'label_value'},
                   resources={'cpu': 3})
    t1_container = Container('/t1', 1, 1, rdt_information,
                             resgroup=ResGroup('/t1'))

    # Only the measurements part of the result is checked here.
    tasks_measurements, _, _ = _prepare_tasks_data({t1_task: t1_container})

    assert tasks_measurements == {}


@patch('wca.cgroups.Cgroup.get_measurements',
       side_effect=MissingMeasurementException())
@patch('wca.perf.PerfCounters')
Ejemplo n.º 7
0
    def get_measurements(self) -> Measurements:
        """Collect CPU, memory and per-NUMA-node measurements of this cgroup.

        Reads, in order:
        - the CPU usage file (value divided by 1e9 to scale it to seconds),
        - memory usage, max usage, limit and soft limit files,
        - the 'pgfault' entry of the memory stat file (if present),
        - the per-node page counts from the NUMA stat file, taken from the
          'hierarchical_total=' line when it exists and from the 'total='
          line only as a fallback.

        :return: Dictionary mapping metric names to values; the NUMA pages
        metric (when present) is itself a dictionary {node_id: pages}
        :raises MissingMeasurementException: when any of the files is missing
        :raises AssertionError: when the collected NUMA node ids are not
        exactly 0..self.platform.numa_nodes-1 in order
        """
        try:
            with open(os.path.join(self.cgroup_cpu_fullpath,
                                   CgroupResource.CPU_USAGE)) as cpu_usage_file:
                # scale to seconds
                cpu_usage = int(cpu_usage_file.read()) / 1e9
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Cpu usage unavailable.'.format(
                    e.filename)) from e

        measurements = {MetricName.TASK_CPU_USAGE_SECONDS: cpu_usage}

        # Files that each hold a single integer value.
        memory_metrics = (
            (CgroupResource.MEMORY_USAGE, MetricName.TASK_MEM_USAGE_BYTES),
            (CgroupResource.MEMORY_MAX_USAGE,
             MetricName.TASK_MEM_MAX_USAGE_BYTES),
            (CgroupResource.MEMORY_LIMIT, MetricName.TASK_MEM_LIMIT_BYTES),
            (CgroupResource.MEMORY_SOFT_LIMIT,
             MetricName.TASK_MEM_SOFT_LIMIT_BYTES),
        )
        for cgroup_resource, metric_name in memory_metrics:
            try:
                with open(os.path.join(self.cgroup_memory_fullpath,
                                       cgroup_resource)) as resource_file:
                    measurements[metric_name] = int(resource_file.read())
            except FileNotFoundError as e:
                raise MissingMeasurementException(
                    'File {} is missing. Metric unavailable.'.format(
                        e.filename)) from e

        # Memory stat - e.g. page faults
        try:
            with open(os.path.join(self.cgroup_memory_fullpath,
                                   CgroupResource.MEMORY_STAT)) as resource_file:
                for line in resource_file:
                    if line.startswith('pgfault'):
                        _, value = line.split()
                        measurements[MetricName.TASK_MEM_PAGE_FAULTS] = int(
                            value)
                        break
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Metric unavailable.'.format(
                    e.filename)) from e

        def get_metric(metric) -> bool:
            """Store per-node page counts from the first NUMA stat line that
            starts with ``metric``; return whether such a line was found."""
            with open(os.path.join(self.cgroup_memory_fullpath,
                                   CgroupResource.NUMA_STAT)) as resource_file:
                for line in resource_file:
                    # Requires mem.use_hierarchy = 1
                    if line.startswith(metric):
                        # First token is the "<metric><value>" summary; the
                        # remaining tokens look like "N<node_id>=<pages>".
                        for stat in line.split()[1:]:
                            node, pages = stat.split("=")
                            measurements.setdefault(
                                MetricName.TASK_MEM_NUMA_PAGES,
                                {})[int(node[1:])] = int(pages)
                        return True
            return False

        try:
            # Previously this flag was never updated, so the "total=" fallback
            # always ran and overwrote the hierarchical values.
            has_hierarchical_metrics = get_metric("hierarchical_total=")
            if not has_hierarchical_metrics:
                # NOTE: because we have no nested containers support
                # total is ok and we do not need hierarchical total
                # because we're already collecting per container and
                # aggregating for Pod
                log.log(
                    logger.TRACE, "No hierarchical_total in NUMA "
                    "memory stat for tasks in cgroup. Using total=.")
                get_metric("total=")
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Metric unavailable.'.format(
                    e.filename)) from e

        # Check that node ids are consecutive and cover all NUMA nodes.
        assert (MetricName.TASK_MEM_NUMA_PAGES not in measurements
                or list(measurements[MetricName.TASK_MEM_NUMA_PAGES].keys())
                == list(range(self.platform.numa_nodes)))

        return measurements