def get_measurements(self, mongroup_name, mb_monitoring_enabled,
                     cache_monitoring_enabled) -> Measurements:
    """Collect resctrl monitoring counters for one monitoring group.

    mbm_total: Memory bandwidth - type: counter, unit: [bytes]
    :return: Dictionary containing memory bandwidth
             and cpu usage measurements
    """
    mon_data_dir = os.path.join(self.fullpath, MON_GROUPS, mongroup_name,
                                MON_DATA)

    def _read_event(socket_dir, event_name):
        # Read a single integer event counter for one socket.
        event_path = os.path.join(mon_data_dir, socket_dir, event_name)
        with open(event_path) as event_file:
            return int(event_file.read())

    total_bandwidth = 0
    total_occupancy = 0

    # Iterate over sockets to gather data (one subdirectory per socket):
    try:
        for socket_dir in os.listdir(mon_data_dir):
            if mb_monitoring_enabled:
                total_bandwidth += _read_event(socket_dir, MBM_TOTAL)
            if cache_monitoring_enabled:
                total_occupancy += _read_event(socket_dir, LLC_OCCUPANCY)
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Measurement unavailable.'.format(
                e.filename))

    measurements = {}
    if mb_monitoring_enabled:
        measurements[MetricName.MEM_BW] = total_bandwidth
    if cache_monitoring_enabled:
        measurements[MetricName.LLC_OCCUPANCY] = total_occupancy
    return measurements
def reset_counters(self):
    """Reset counters managed by cgroup abstraction.

    After one of the containers from a Pod restarts, the cgroup is
    reused, but cpuacct.usage still holds a value from previous
    unsuccessful runs. For multi-container Pods there is a problem when
    a long-lived sum of counters from all containers is slightly
    decreased, because any decrease in a counter is treated by
    Prometheus as a reset; it is then assumed that the "total value" is
    the last period's decrease, causing unrealistic spikes in counter
    rate/increase.

    There are two solutions:
    1. Make sure that after we reinitialize cgroups, we reset all the
       counters (this solution),
    2. Do not aggregate counters (by summing), but expose them as
       additional time series (new metrics per container)
       (to be considered, but requires an API change (new levels) for
       some metrics).

    ps. The cgroups solution works because all other metrics are:
    gauges (NUMA), collected per Pod (RDT), or properly reset to 0
    (perf counters).

    :raises MissingMeasurementException: when cpuacct.usage is missing
    """
    # Writing '0' to cpuacct.usage asks the kernel to zero the counter.
    try:
        with open(os.path.join(self.cgroup_cpu_fullpath,
                               CgroupResource.CPU_USAGE), 'w') as \
                cpu_usage_file:
            cpu_usage_file.write('0')
    except FileNotFoundError as e:
        # BUGFIX: error message typo corrected ('perfom' -> 'perform').
        raise MissingMeasurementException(
            'File {} is missing. Cpu usage (to perform reset) unavailable.'.
            format(e.filename))
def _get_cgroup_fd(cgroup) -> int:
    """Open the perf_event cgroup directory and return its descriptor.

    :raises MissingMeasurementException: when the cgroup directory
        does not exist
    """
    cgroup_path = os.path.join('/sys/fs/cgroup/perf_event', cgroup)
    try:
        # cgroup is a directory, so we can't use fdopen on the file
        # descriptor we receive from os.open
        fd = os.open(cgroup_path, os.O_RDONLY)
    except FileNotFoundError:
        raise MissingMeasurementException(
            'cannot initialize perf for cgroup %r - directory not found'
            % cgroup)
    return fd
def get_measurements(self) -> Measurements:
    """Read cpu and memory usage counters from cgroup control files.

    :return: dictionary with per-task cpu usage and memory usage
    :raises MissingMeasurementException: when a control file is missing
    """
    cpu_usage_path = os.path.join(self.cgroup_cpu_fullpath,
                                  CgroupResource.CPU_USAGE)
    try:
        with open(cpu_usage_path) as cpu_usage_file:
            cpu_usage = int(cpu_usage_file.read())
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Cpu usage unavailable.'.format(e.filename))

    memory_usage_path = os.path.join(self.cgroup_memory_fullpath,
                                     CgroupResource.MEMORY_USAGE)
    try:
        with open(memory_usage_path) as memory_usage_file:
            memory_usage = int(memory_usage_file.read())
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Memory usage unavailable.'.format(e.filename))

    return {
        MetricName.CPU_USAGE_PER_TASK: cpu_usage,
        MetricName.MEM_USAGE_PER_TASK: memory_usage,
    }
def get_measurements(self, mongroup_name, mb_monitoring_enabled,
                     cache_monitoring_enabled) -> Measurements:
    """Gather RDT counters for one resctrl monitoring group.

    mbm_total: Memory bandwidth - type: counter, unit: [bytes]
    mbm_local: Local memory bandwidth - type: counter, unit: [bytes]
    mbm_remote: Remote memory bandwidth - type: counter, unit: [bytes]
    :return: Dictionary containing memory bandwidth
             and cpu usage measurements
    """
    def _read_counter(socket_dir, event_name):
        # Read one integer event counter for a single socket.
        event_path = self.get_event_file(socket_dir, event_name,
                                         mongroup_name)
        with open(event_path) as event_file:
            return int(event_file.read())

    total_bandwidth = 0
    local_bandwidth = 0
    occupancy = 0

    # Iterate over sockets to gather data:
    try:
        for socket_dir in self.get_socket_dirs(mongroup_name):
            if mb_monitoring_enabled:
                total_bandwidth += _read_counter(socket_dir, MBM_TOTAL)
                local_bandwidth += _read_counter(socket_dir, MBM_LOCAL)
            if cache_monitoring_enabled:
                occupancy += _read_counter(socket_dir, LLC_OCCUPANCY)
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Measurement unavailable.'.format(
                e.filename))

    measurements = {}
    if mb_monitoring_enabled:
        measurements[MetricName.TASK_MEM_BANDWIDTH_BYTES] = total_bandwidth
        measurements[MetricName.TASK_MEM_BANDWIDTH_LOCAL_BYTES] = \
            local_bandwidth
        # Remote bandwidth is derived, not read from a file.
        measurements[MetricName.TASK_MEM_BANDWIDTH_REMOTE_BYTES] = \
            total_bandwidth - local_bandwidth
    if cache_monitoring_enabled:
        measurements[MetricName.TASK_LLC_OCCUPANCY_BYTES] = occupancy
    return measurements
containers) assert tasks_measurements == {'t1_task_id': {'cpu_usage': 13}} assert tasks_resources == {'t1_task_id': {'cpu': 3}} assert tasks_labels == { 't1_task_id': { 'initial_task_cpu_assignment': 'unknown', 'label_key': 'label_value', 'task_id': 't1_task_id' } } @patch('wca.cgroups.Cgroup') @patch('wca.resctrl.ResGroup.get_measurements', side_effect=MissingMeasurementException()) @patch('wca.perf.PerfCounters') def test_prepare_task_data_resgroup_not_found(*mocks): rdt_information = RDTInformation(True, True, True, True, '0', '0', 0, 0, 0) containers = { task('/t1', labels={'label_key': 'label_value'}, resources={'cpu': 3}): Container('/t1', 1, 1, rdt_information, resgroup=ResGroup('/t1')) } tasks_measurements, tasks_resources, tasks_labels = \ _prepare_tasks_data(containers) assert tasks_measurements == {} @patch('wca.cgroups.Cgroup.get_measurements', side_effect=MissingMeasurementException()) @patch('wca.perf.PerfCounters')
def get_measurements(self) -> Measurements:
    """Gather per-task measurements from cgroup control files.

    Collected values:
    - cpu usage (cpuacct.usage scaled from nanoseconds to seconds),
    - memory usage / max usage / limit / soft limit gauges,
    - page faults (from memory.stat),
    - per-NUMA-node memory pages (from memory.numa_stat).

    :return: dictionary mapping metric name to measured value
    :raises MissingMeasurementException: when a required cgroup control
        file does not exist
    """
    try:
        with open(os.path.join(self.cgroup_cpu_fullpath,
                               CgroupResource.CPU_USAGE)) as \
                cpu_usage_file:
            # scale to seconds
            cpu_usage = int(cpu_usage_file.read()) / 1e9
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Cpu usage unavailable.'.format(
                e.filename))

    measurements = {MetricName.TASK_CPU_USAGE_SECONDS: cpu_usage}

    # Single-integer memory gauges read straight from control files.
    for cgroup_resource, metric_name in [
            (CgroupResource.MEMORY_USAGE, MetricName.TASK_MEM_USAGE_BYTES),
            (CgroupResource.MEMORY_MAX_USAGE,
             MetricName.TASK_MEM_MAX_USAGE_BYTES),
            (CgroupResource.MEMORY_LIMIT, MetricName.TASK_MEM_LIMIT_BYTES),
            (CgroupResource.MEMORY_SOFT_LIMIT,
             MetricName.TASK_MEM_SOFT_LIMIT_BYTES),
    ]:
        try:
            with open(os.path.join(self.cgroup_memory_fullpath,
                                   cgroup_resource)) as resource_file:
                measurements[metric_name] = int(resource_file.read())
        except FileNotFoundError as e:
            raise MissingMeasurementException(
                'File {} is missing. Metric unavailable.'.format(
                    e.filename))

    # Memory stat - e.g. page faults.
    try:
        with open(os.path.join(self.cgroup_memory_fullpath,
                               CgroupResource.MEMORY_STAT)) as resource_file:
            for line in resource_file:
                if line.startswith('pgfault'):
                    _, value = line.split()
                    measurements[MetricName.TASK_MEM_PAGE_FAULTS] = int(value)
                    break
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Metric unavailable.'.format(e.filename))

    def get_metric(metric):
        """Parse memory.numa_stat for the line starting with *metric*
        and fill TASK_MEM_NUMA_PAGES with {numa_node_id: pages}.

        Returns True when the metric line was found, False otherwise.
        """
        with open(os.path.join(self.cgroup_memory_fullpath,
                               CgroupResource.NUMA_STAT)) as resource_file:
            for line in resource_file:
                # hierarchical_* lines require memory.use_hierarchy = 1.
                if line.startswith(metric):
                    numa_pages = measurements.setdefault(
                        MetricName.TASK_MEM_NUMA_PAGES, {})
                    # Per-node entries look like 'N0=1234';
                    # strip the leading 'N' to get the node id.
                    for stat in line.split()[1:]:
                        node, pages = stat.split("=")
                        numa_pages[int(node[1:])] = int(pages)
                    return True
        return False

    try:
        # BUGFIX: get_metric now reports whether the line was found.
        # Previously has_hierarchical_metrics was initialized to False
        # and never updated, so "total=" always overwrote the
        # hierarchical values and the TRACE message was always logged.
        has_hierarchical_metrics = get_metric("hierarchical_total=")
        if not has_hierarchical_metrics:
            # NOTE: because we have no nested containers support,
            # total is ok and we do not need the hierarchical total:
            # we already collect per container and aggregate per Pod.
            log.log(
                logger.TRACE, "No hierarchical_total in NUMA "
                "memory stat for tasks in cgroup. Using total=.")
            get_metric("total=")
    except FileNotFoundError as e:
        raise MissingMeasurementException(
            'File {} is missing. Metric unavailable.'.format(e.filename))

    # Internal invariant: NUMA keys must be consecutive node ids 0..N-1.
    assert (MetricName.TASK_MEM_NUMA_PAGES not in measurements
            or list(measurements[MetricName.TASK_MEM_NUMA_PAGES].keys()) ==
            list(range(self.platform.numa_nodes)))

    return measurements