Example #1
0
def get_violations():
    """Return a JSON document of cross-package and shared-core violations.

    The CPU object is fetched once so that both violation checks operate on
    the same snapshot (previously it was fetched independently for each
    check, which could observe two different states).
    """
    cpu = get_workload_manager().get_cpu()
    return json.dumps({
        "cross_package": get_cross_package_violations(cpu),
        "shared_core": get_shared_core_violations(cpu)
    })
Example #2
0
def get_wm_status():
    """Return a JSON snapshot of workload manager status.

    The workload manager is looked up once so all reported fields come from
    the same manager instance (previously each field did its own lookup,
    which is not guaranteed to be a consistent snapshot).
    """
    wm = get_workload_manager()
    return json.dumps({
        "workload_manager": {
            "cpu_allocator": wm.get_allocator_name(),
            "workload_count": len(wm.get_workloads()),
            "isolated_workload_count": len(wm.get_isolated_workload_ids())
        }
    })
    def report_metrics(self, tags):
        """Publish static/burst pool CPU usage gauges and the accumulated
        resource-usage failure counter.

        Does nothing (beyond a debug log) until both the metrics registry and
        the workload manager are available, or when no CPU usage data exists.
        """
        if self.__registry is None:
            log.debug("Not reporting metrics because there's no registry available yet.")
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        usage_dict = self.__get_usage_dict(wm.get_workload_map_copy().keys())
        if CPU_USAGE not in usage_dict:
            log.warning("No CPU usage in usage: %s", usage_dict)
            return

        cpu_usage = usage_dict[CPU_USAGE]
        static_usage = self.__get_pool_usage(STATIC, cpu_usage)
        burst_usage = self.__get_pool_usage(BURST, cpu_usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)

        # Report and reset the failure count under the lock that guards it.
        with self.__metric_lock:
            failures = self.__get_resource_usage_failure_count
            self.__registry.counter(GET_RESOURCE_USAGE_FAILURE, tags).increment(failures)
            self.__get_resource_usage_failure_count = 0
Example #4
0
    def __get_workloads():
        """Return the current workloads, or [] if the manager isn't up yet."""
        manager = get_workload_manager()
        if manager is not None:
            return manager.get_workloads()

        log.debug("Workload manager not yet present.")
        return []
Example #5
0
    def report_metrics(self, tags):
        """Publish static and burst pool CPU usage gauges from PCP data.

        No-ops (with a debug log) until both the registry and workload
        manager exist, or when PCP reports no CPU usage.
        """
        if self.__registry is None:
            log.debug(
                "Not reporting metrics because there's no registry available yet."
            )
            return

        if get_workload_manager() is None:
            log.debug(
                "Not reporting metrics because there's no workload manager available yet."
            )
            return

        pcp_usage = self.get_pcp_usage()
        if CPU_USAGE not in pcp_usage:
            log.warning("No CPU usage in PCP usage.")
            return

        cpu_usage = pcp_usage[CPU_USAGE]
        static_usage = self.__get_pool_usage(STATIC, cpu_usage)
        burst_usage = self.__get_pool_usage(BURST, cpu_usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)
Example #6
0
def isolate_workload(workload_id):
    """Report whether a workload is isolated.

    Returns (body, 200, headers) when the workload is isolated, otherwise
    (body, 404, headers).  On the isolated path the isolation latency is
    recorded; on the not-isolated path the first-seen time is remembered so
    latency can be measured later.

    Fixes: `log.warn` is a deprecated alias of `log.warning`; the lock is now
    released in a `finally` so an exception in the manager or metrics calls
    cannot leak it (it was previously released manually on each path).
    """
    # We acquire a lock here to serialize callers and protect against contention with actual isolation work.
    if not __isolate_lock.acquire(timeout=0.1):
        log.warning("timeout getting isolate lock for workload: {}".format(
            workload_id))
        return json.dumps({'workload_id': workload_id}), 404, {
            'ContentType': 'application/json'
        }

    try:
        start_time = time.time()

        if get_workload_manager().is_isolated(workload_id):
            stop_time = time.time()
            if metrics_manager is not None:
                # Prefer the time we first saw the workload, if recorded.
                start_time = __isolate_latency.pop(workload_id, start_time)
                duration = stop_time - start_time
                registry.distribution_summary(
                    ISOLATE_LATENCY_KEY,
                    metrics_manager.get_tags()).record(duration)

            log.info("workload: '{}' IS isolated".format(workload_id))
            return json.dumps({'workload_id': workload_id}), 200, {
                'ContentType': 'application/json'
            }

        log.info("workload: '{}' is NOT isolated".format(workload_id))
        if workload_id not in __isolate_latency:
            __isolate_latency[workload_id] = time.time()

        return json.dumps({'unknown_workload_id': workload_id}), 404, {
            'ContentType': 'application/json'
        }
    finally:
        __isolate_lock.release()
Example #7
0
def get_wm_status():
    """Return a JSON status snapshot of the event and workload managers.

    Each manager is looked up exactly once so every counter in the document
    comes from the same manager instance (previously every field performed
    its own lookup, so the counters were not a consistent snapshot).
    """
    em = get_event_manager()
    wm = get_workload_manager()
    return json.dumps({
        "event_manager": {
            "queue_depth": em.get_queue_depth(),
            "success_count": em.get_success_count(),
            "error_count": em.get_error_count(),
            "processed_count": em.get_processed_count()
        },
        "workload_manager": {
            "cpu_allocator": wm.get_allocator_name(),
            "workload_count": len(wm.get_workloads()),
            "isolated_workload_count": len(wm.get_isolated_workload_ids()),
            "success_count": wm.get_success_count(),
            "error_count": wm.get_error_count(),
            "added_count": wm.get_added_count(),
            "removed_count": wm.get_removed_count()
        }
    })
Example #8
0
    def handle(self, event):
        """Reconcile titus-isolate and cgroup state for relevant events."""
        if not self.__relevant(event):
            return

        # Deep copy: reconciliation operates on a snapshot rather than the
        # live CPU object held by the workload manager.
        cpu_snapshot = copy.deepcopy(get_workload_manager().get_cpu())
        self.handling_event(event,
                            "reconciling titus-isolate and cgroup state")
        self.__reconciler.reconcile(cpu_snapshot)
        self.handled_event(event, "reconciled titus-isolate and cgroup state")
Example #9
0
    def get_isolated_workload_ids(self):
        """Return a copy of the isolated workload id set, after pruning ids
        that no longer correspond to a known workload."""
        with self.__lock:
            wm = get_workload_manager()
            if wm is None:
                return set()

            known_ids = {w.get_id() for w in wm.get_workloads()}
            self.__isolated_workload_ids = \
                self.__isolated_workload_ids.intersection(known_ids)
            return copy.deepcopy(self.__isolated_workload_ids)
    def __snapshot_usage_raw(self):
        """Query PCP (pmrep) for recent per-container resource usage and cache
        the parsed result in self.__usages.

        Skips the query entirely when no workloads are present, because pmrep
        fails on an empty dataset and produces noisy failure-looking logs.

        Fixes: the cgroup filter is now a raw string (`\\-` is not a valid
        string escape; as a regex it matches a literal '-'); the bare
        `except:` is narrowed to `except Exception:` so SystemExit and
        KeyboardInterrupt still propagate; the dead
        "INVALID_INSTANCE_FILTER" assignment is removed (both branches
        always assign the filter).
        """
        try:
            # Avoid making a metrics query on a potentially empty dataset which causes the query command to fail, which
            # causes noisy logs which look like failures.
            workload_manager = get_workload_manager()
            if workload_manager is None or len(
                    workload_manager.get_workloads()) == 0:
                log.info('No workloads so skipping pcp snapshot.')
                return

            if is_kubernetes():
                instance_filter = '.*titus-executor.*.service'
            else:
                # UUID-style container cgroup paths.
                instance_filter = r'/containers.slice/[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}'

            # pmrep -a /var/log/pcp/pmlogger/$(hostname)/ -S -60m -t 1m -y s -o csv -i .*titus-executor.*.service  cgroup.cpuacct.usage cgroup.memory.usage
            snapshot_cmd_fmt = """ pmrep -a {0} \
                    -S -{1}s \
                    -T -0s \
                    -t {2}s \
                    -y s \
                    -o csv \
                    -i {3} \
                    cgroup.cpuacct.usage \
                    cgroup.memory.usage \
                    titus.network.in.bytes \
                    titus.network.out.bytes \
                    titus.disk.bytes_used """

            cmd_str = snapshot_cmd_fmt.format(get_pcp_archive_path(),
                                              self.__relative_start_sec,
                                              self.__interval_sec,
                                              instance_filter)

            log.info('Snapshoting usage from pcp: {}'.format(' '.join(
                cmd_str.split())))

            byte_array = subprocess.check_output(
                cmd_str, shell=True, timeout=self.__query_timeout_sec)
            raw_csv_snapshot = byte_array.decode('utf-8')
            usages = get_resource_usage(raw_csv_snapshot,
                                        self.__interval_count,
                                        self.__interval_sec)

            with self.__lock:
                self.__usages = usages
        except Exception:
            # Best-effort snapshot: log and drop any query/parse failure.
            log.exception("Failed to snapshot pcp data or compute usages")
Example #11
0
    def report_metrics(self, tags):
        """Publish static and burst pool CPU usage gauges.

        No-ops (with a debug log) until both the metrics registry and the
        workload manager are available.
        """
        if self.__registry is None:
            log.debug("Not reporting metrics because there's no registry available yet.")
            return

        if get_workload_manager() is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        cpu_usage = self.get_cpu_usage(60, 60)
        static_usage = self.__get_pool_usage(STATIC, cpu_usage)
        burst_usage = self.__get_pool_usage(BURST, cpu_usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_usage)
Example #12
0
def isolate_workload(workload_id, timeout=None):
    """Wait up to `timeout` seconds for the workload to become isolated.

    Polls every 100ms.  Responds (body, 200, headers) once isolated and
    (body, 404, headers) on timeout.  When `timeout` is None it is read
    from configuration.
    """
    headers = {'ContentType': 'application/json'}

    if timeout is None:
        timeout = get_config_manager().get_float(
            TITUS_ISOLATE_BLOCK_SEC, DEFAULT_TITUS_ISOLATE_BLOCK_SEC)

    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_workload_manager().is_isolated(workload_id):
            return json.dumps({'workload_id': workload_id}), 200, headers
        time.sleep(0.1)

    log.error("Failed to isolate workload: '{}'".format(workload_id))
    return json.dumps({'unknown_workload_id': workload_id}), 404, headers
    def __get_pool_usage(workload_type, usage):
        """Sum the last usage sample of each workload of the given type.

        `usage` maps workload id -> sequence of samples; workloads not known
        to the workload manager are skipped.  Returns None (after a debug
        log) when the workload manager isn't available yet.
        """
        wm = get_workload_manager()
        if wm is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        workload_map = wm.get_workload_map_copy()

        total = 0.0
        # Renamed loop variable: the original shadowed the `usage` parameter.
        for workload_id, samples in usage.items():
            workload = workload_map.get(workload_id)
            if workload is None:
                continue

            if workload.get_type() == workload_type:
                total += float(samples[-1])

        return total
Example #14
0
    def get_tags():
        """Build the common metric tags: node id, CPU allocator, and cell."""
        tags = {}

        instance_id = os.environ.get('EC2_INSTANCE_ID')
        if instance_id is not None:
            tags["node"] = instance_id
            tags["nf.node"] = instance_id

        wm = get_workload_manager()
        tags[CPU_ALLOCATOR] = (
            UNKNOWN_CPU_ALLOCATOR if wm is None else wm.get_allocator_name())
        tags[CELL] = get_cell_name()

        return tags
Example #15
0
def get_isolated_workload_ids():
    """Return the isolated workload ids as a JSON array."""
    isolated_ids = get_workload_manager().get_isolated_workload_ids()
    return json.dumps(list(isolated_ids))
Example #16
0
def get_workloads():
    """Return all known workloads as a JSON array of dicts."""
    as_dicts = [
        workload.to_dict()
        for workload in get_workload_manager().get_workloads()
    ]
    return json.dumps(as_dicts)
Example #17
0
def get_cpu():
    """Return the current CPU model as a JSON document."""
    cpu_dict = get_workload_manager().get_cpu().to_dict()
    return json.dumps(cpu_dict)