def get_violations():
    """Return a JSON report of CPU placement violations.

    Covers cross-package and shared-core violations computed from the
    workload manager's current CPU model.
    """
    wm = get_workload_manager()
    report = {
        "cross_package": get_cross_package_violations(wm.get_cpu()),
        "shared_core": get_shared_core_violations(wm.get_cpu())
    }
    return json.dumps(report)
def get_wm_status():
    """Return a JSON summary of the workload manager's state."""
    wm = get_workload_manager()
    status = {
        "workload_manager": {
            "cpu_allocator": wm.get_allocator_name(),
            "workload_count": len(wm.get_workloads()),
            "isolated_workload_count": len(wm.get_isolated_workload_ids())
        }
    }
    return json.dumps(status)
def report_metrics(self, tags):
    """Publish pool CPU usage gauges and flush the resource-usage failure counter.

    Emits static and burst pool usage gauges derived from per-workload usage,
    then reports (and resets) the accumulated failure count under the metric
    lock. Skips reporting entirely while the registry or workload manager is
    not yet available.
    """
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    wm = get_workload_manager()
    if wm is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    workload_ids = wm.get_workload_map_copy().keys()
    usage_dict = self.__get_usage_dict(workload_ids)
    if CPU_USAGE not in usage_dict:
        log.warning("No CPU usage in usage: %s", usage_dict)
        return

    usage = usage_dict[CPU_USAGE]
    static_usage = self.__get_pool_usage(STATIC, usage)
    burst_usage = self.__get_pool_usage(BURST, usage)
    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)

    # The failure count is incremented elsewhere; report and reset it
    # atomically with respect to those writers.
    with self.__metric_lock:
        self.__registry.counter(GET_RESOURCE_USAGE_FAILURE, tags).increment(
            self.__get_resource_usage_failure_count)
        self.__get_resource_usage_failure_count = 0
def __get_workloads():
    """Return the current workloads, or an empty list if the manager is not up yet."""
    wm = get_workload_manager()
    if wm is not None:
        return wm.get_workloads()
    log.debug("Workload manager not yet present.")
    return []
def report_metrics(self, tags):
    """Publish static and burst pool CPU usage gauges from PCP usage data.

    Does nothing while the registry or workload manager is unavailable, or
    when the PCP snapshot contains no CPU usage.
    """
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    if get_workload_manager() is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    pcp_usage = self.get_pcp_usage()
    if CPU_USAGE not in pcp_usage:
        log.warning("No CPU usage in PCP usage.")
        return

    cpu_usage = pcp_usage[CPU_USAGE]
    static_usage = self.__get_pool_usage(STATIC, cpu_usage)
    burst_usage = self.__get_pool_usage(BURST, cpu_usage)
    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)
def isolate_workload(workload_id):
    """Report whether a workload is isolated.

    Returns a (json_body, http_status, headers) tuple: 200 when the workload
    is isolated, 404 when it is unknown / not yet isolated or when the
    isolate lock could not be acquired. Records isolation latency when a
    metrics manager is available.
    """
    # We acquire a lock here to serialize callers and protect against
    # contention with actual isolation work.
    if not __isolate_lock.acquire(timeout=0.1):
        # log.warning: log.warn is a deprecated alias.
        log.warning("timeout getting isolate lock for workload: {}".format(
            workload_id))
        return json.dumps({'workload_id': workload_id}), 404, {
            'ContentType': 'application/json'
        }

    # try/finally guarantees the lock is released even if the workload
    # manager or the metrics registry raises; the original released it only
    # on the non-exceptional paths, leaking the lock on error.
    try:
        start_time = time.time()
        if get_workload_manager().is_isolated(workload_id):
            stop_time = time.time()
            if metrics_manager is not None:
                # Prefer the timestamp recorded when the workload was first
                # observed un-isolated; fall back to this call's start time.
                start_time = __isolate_latency.pop(workload_id, start_time)
                duration = stop_time - start_time
                registry.distribution_summary(
                    ISOLATE_LATENCY_KEY,
                    metrics_manager.get_tags()).record(duration)
            log.info("workload: '{}' IS isolated".format(workload_id))
            return json.dumps({'workload_id': workload_id}), 200, {
                'ContentType': 'application/json'
            }

        log.info("workload: '{}' is NOT isolated".format(workload_id))
        # Remember the first time we saw the workload so its isolation
        # latency can be measured once it does become isolated.
        if workload_id not in __isolate_latency:
            __isolate_latency[workload_id] = time.time()
        return json.dumps({'unknown_workload_id': workload_id}), 404, {
            'ContentType': 'application/json'
        }
    finally:
        __isolate_lock.release()
def get_wm_status():
    """Return a JSON status report covering the event and workload managers."""
    em = get_event_manager()
    wm = get_workload_manager()
    status = {
        "event_manager": {
            "queue_depth": em.get_queue_depth(),
            "success_count": em.get_success_count(),
            "error_count": em.get_error_count(),
            "processed_count": em.get_processed_count()
        },
        "workload_manager": {
            "cpu_allocator": wm.get_allocator_name(),
            "workload_count": len(wm.get_workloads()),
            "isolated_workload_count": len(wm.get_isolated_workload_ids()),
            "success_count": wm.get_success_count(),
            "error_count": wm.get_error_count(),
            "added_count": wm.get_added_count(),
            "removed_count": wm.get_removed_count()
        }
    }
    return json.dumps(status)
def handle(self, event):
    """Reconcile titus-isolate's CPU model with cgroup state for relevant events."""
    if not self.__relevant(event):
        return

    # Reconcile against a deep copy so the live CPU model is not mutated.
    cpu_snapshot = copy.deepcopy(get_workload_manager().get_cpu())
    self.handling_event(event, "reconciling titus-isolate and cgroup state")
    self.__reconciler.reconcile(cpu_snapshot)
    self.handled_event(event, "reconciled titus-isolate and cgroup state")
def get_isolated_workload_ids(self):
    """Return a copy of the isolated workload ids, pruned of removed workloads."""
    with self.__lock:
        wm = get_workload_manager()
        if wm is None:
            return set()

        current_ids = {w.get_id() for w in wm.get_workloads()}
        # Drop ids for workloads that no longer exist before handing out a copy.
        self.__isolated_workload_ids = self.__isolated_workload_ids.intersection(current_ids)
        return copy.deepcopy(self.__isolated_workload_ids)
def __snapshot_usage_raw(self):
    """Query PCP for recent per-container resource usage and cache the result.

    Runs a pmrep query against the local PCP archive, parses the CSV output
    into usages and stores them under the instance lock. All failures are
    logged (best-effort) rather than raised.
    """
    try:
        # Avoid making a metrics query on a potentially empty dataset which
        # causes the query command to fail, which causes noisy logs which
        # look like failures.
        workload_manager = get_workload_manager()
        if workload_manager is None or len(workload_manager.get_workloads()) == 0:
            log.info('No workloads so skipping pcp snapshot.')
            return

        if is_kubernetes():
            instance_filter = '.*titus-executor.*.service'
        else:
            # Raw string: the original literal relied on invalid escape
            # sequences ('\-') that recent Python versions warn about.
            instance_filter = r'/containers.slice/[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}'

        # pmrep -a /var/log/pcp/pmlogger/$(hostname)/ -S -60m -t 1m -y s -o csv -i .*titus-executor.*.service cgroup.cpuacct.usage cgroup.memory.usage
        snapshot_cmd_fmt = """ pmrep -a {0} \
            -S -{1}s \
            -T -0s \
            -t {2}s \
            -y s \
            -o csv \
            -i {3} \
            cgroup.cpuacct.usage \
            cgroup.memory.usage \
            titus.network.in.bytes \
            titus.network.out.bytes \
            titus.disk.bytes_used """
        cmd_str = snapshot_cmd_fmt.format(
            get_pcp_archive_path(),
            self.__relative_start_sec,
            self.__interval_sec,
            instance_filter)

        log.info('Snapshoting usage from pcp: {}'.format(' '.join(cmd_str.split())))
        byte_array = subprocess.check_output(
            cmd_str, shell=True, timeout=self.__query_timeout_sec)
        raw_csv_snapshot = byte_array.decode('utf-8')
        usages = get_resource_usage(
            raw_csv_snapshot, self.__interval_count, self.__interval_sec)

        with self.__lock:
            self.__usages = usages
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; operational failures remain logged as best-effort.
        log.exception("Failed to snapshot pcp data or compute usages")
def report_metrics(self, tags):
    """Publish static and burst pool CPU usage gauges from the last hour of usage."""
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    if get_workload_manager() is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    cpu_usage = self.get_cpu_usage(60, 60)
    static_usage = self.__get_pool_usage(STATIC, cpu_usage)
    burst_usage = self.__get_pool_usage(BURST, cpu_usage)
    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)
def isolate_workload(workload_id, timeout=None):
    """Block up to `timeout` seconds waiting for a workload to become isolated.

    Returns a (json_body, http_status, headers) tuple: 200 once the workload
    is isolated, 404 when the deadline expires first. When `timeout` is None
    the configured default block time is used.
    """
    if timeout is None:
        timeout = get_config_manager().get_float(
            TITUS_ISOLATE_BLOCK_SEC, DEFAULT_TITUS_ISOLATE_BLOCK_SEC)

    end_time = time.time() + timeout
    while time.time() < end_time:
        if get_workload_manager().is_isolated(workload_id):
            return json.dumps({'workload_id': workload_id}), 200, {
                'ContentType': 'application/json'
            }
        # Poll cheaply; isolation normally completes well within the deadline.
        time.sleep(0.1)

    log.error("Failed to isolate workload: '{}'".format(workload_id))
    return json.dumps({'unknown_workload_id': workload_id}), 404, {
        'ContentType': 'application/json'
    }
def __get_pool_usage(workload_type, usage):
    """Sum the most recent CPU usage sample across workloads of the given type.

    `usage` maps workload id -> sequence of samples (newest last). Returns
    None when the workload manager is not yet available.
    """
    wm = get_workload_manager()
    if wm is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    workload_map = wm.get_workload_map_copy()
    pool_cpu_usage = 0.0
    # Renamed the loop variable so it no longer shadows the `usage` parameter.
    for w_id, samples in usage.items():
        workload = workload_map.get(w_id)
        if workload is not None and workload.get_type() == workload_type:
            pool_cpu_usage += float(samples[-1])
    return pool_cpu_usage
def get_tags():
    """Build the base metric tags: node id, CPU allocator name and cell."""
    tags = {}

    instance_id = os.environ.get('EC2_INSTANCE_ID')
    if instance_id is not None:
        tags["node"] = instance_id
        tags["nf.node"] = instance_id

    wm = get_workload_manager()
    tags[CPU_ALLOCATOR] = UNKNOWN_CPU_ALLOCATOR if wm is None else wm.get_allocator_name()
    tags[CELL] = get_cell_name()
    return tags
def get_isolated_workload_ids():
    """Return the isolated workload ids as a JSON list."""
    isolated_ids = get_workload_manager().get_isolated_workload_ids()
    return json.dumps(list(isolated_ids))
def get_workloads():
    """Return every known workload serialized as a JSON list of dicts."""
    wm = get_workload_manager()
    workload_dicts = [workload.to_dict() for workload in wm.get_workloads()]
    return json.dumps(workload_dicts)
def get_cpu():
    """Return the current CPU model as JSON."""
    cpu_dict = get_workload_manager().get_cpu().to_dict()
    return json.dumps(cpu_dict)