Example No. 1
    def __update_workload(self, func, arg, workload_id):
        try:
            with self.__lock:
                log.debug("Acquired lock for func: {} on workload: {}".format(
                    func.__name__, workload_id))
                start_time = time.time()
                func(arg)
                stop_time = time.time()
                if self.__reg is not None:
                    self.__reg.distribution_summary(
                        self.__get_workload_processing_metric_name(
                            func.__name__),
                        self.__tags).record(stop_time - start_time)
                    self.__reg.distribution_summary(
                        WORKLOAD_PROCESSING_DURATION,
                        self.__tags).record(stop_time - start_time)

            log.debug("Released lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            return True
        except Exception:
            self.__error_count += 1
            log.exception("Failed to execute func: {} on workload: {}".format(
                func.__name__, workload_id))
            return False
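The locking-and-timing pattern above can be shown standalone. A minimal sketch using only the standard library; the registry calls and metric names in the original are project-specific and therefore omitted:

import threading
import time

lock = threading.Lock()

def timed_update(func, arg):
    # Serialize the update and measure how long it takes while holding
    # the lock, mirroring __update_workload above.
    with lock:
        start_time = time.time()
        func(arg)
        duration = time.time() - start_time
    # The original records the duration to two distribution summaries;
    # here it is simply returned to the caller.
    return duration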
Example No. 2
    def assign_threads(self, workload):
        thread_count = workload.get_thread_count()
        claimed_threads = []

        if thread_count == 0:
            return claimed_threads

        log.info("Assigning '{}' thread(s) to workload: '{}'".format(workload.get_thread_count(), workload.get_id()))

        if is_cpu_full(self.__cpu):
            raise ValueError("Cannot assign workload: '{}' to full CPU.".format(workload.get_id()))

        package = self.__cpu.get_emptiest_package()

        while thread_count > 0 and len(package.get_empty_threads()) > 0:
            core = get_emptiest_core(package)
            empty_threads = core.get_empty_threads()[:thread_count]

            for empty_thread in empty_threads:
                log.debug("Claiming package:core:thread '{}:{}:{}' for workload '{}'".format(
                    package.get_id(), core.get_id(), empty_thread.get_id(), workload.get_id()))
                empty_thread.claim(workload.get_id())
                claimed_threads.append(empty_thread)
                thread_count -= 1

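        # Any demand this package could not satisfy spills over to the
        # next-emptiest package via the recursive call below; the
        # recursion terminates because thread_count only shrinks and the
        # base case (thread_count == 0) returns immediately.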
        return claimed_threads + self.assign_threads(Workload(workload.get_id(), thread_count, workload.get_type()))
Example No. 3
    def __process_events(self):
        while not self.__stopped:
            try:
                event = self.__q.get(timeout=self.__event_timeout)
                dequeue_time = time.time()
                log.info("Dequeued event: {}, queue depth: {}".format(event[ACTION], self.get_queue_depth()))
                if self.__reg is not None:
                    self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                    self.__reg.counter(self.__get_dequeued_metric_name(event), self.__tags).increment()
                    self.__reg.distribution_summary(QUEUE_LATENCY_KEY, self.__tags).record(dequeue_time - event[ENQUEUE_TIME_KEY])
            except Empty:
                log.debug("Timed out waiting for event on queue.")
                continue

            for event_handler in self.__event_handlers:
                try:
                    log.info("{} handling event: {}".format(type(event_handler).__name__, event[ACTION]))
                    event_handler.handle(event)
                    self.__report_succeeded_event(event_handler)
                except Exception:
                    log.error("Event handler: '{}' failed to handle event: '{}'".format(
                        type(event_handler).__name__, event))
                    self.__report_failed_event(event_handler)

            self.__q.task_done()
            if self.__reg is not None:
                self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
                self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
            self.__processed_count += 1
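The dequeue-with-timeout loop above is a common consumer pattern: a bounded get lets the thread periodically re-check its stop flag instead of blocking forever. A self-contained sketch using only the standard library (handlers and metrics omitted):

import queue

q = queue.Queue()
stopped = False

def process_events(event_timeout=1.0):
    while not stopped:
        try:
            # Wake up at least every event_timeout seconds so the stop
            # flag is re-checked even when the queue is idle.
            event = q.get(timeout=event_timeout)
        except queue.Empty:
            continue
        try:
            print("handling:", event)  # stand-in for the real handlers
        finally:
            q.task_done()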
Example No. 4
    def __get_rebalance_request(self):
        workload_map = self.get_workload_map_copy()
        resource_usage = self.__wmm.get_resource_usage(workload_map.keys())
        log.debug("resource_usage: %s", json.dumps(resource_usage.serialize()))
        cpu_usage = self.__get_optional_default(resource_usage.get_cpu_usage,
                                                {})
        mem_usage = self.__get_optional_default(resource_usage.get_mem_usage,
                                                {})
        net_recv_usage = self.__get_optional_default(
            resource_usage.get_net_recv_usage, {})
        net_trans_usage = self.__get_optional_default(
            resource_usage.get_net_trans_usage, {})
        disk_usage = self.__get_optional_default(resource_usage.get_disk_usage,
                                                 {})

        return AllocateRequest(
            cpu=self.get_cpu_copy(),
            workloads=workload_map,
            resource_usage=resource_usage,
            cpu_usage=cpu_usage,
            mem_usage=mem_usage,
            net_recv_usage=net_recv_usage,
            net_trans_usage=net_trans_usage,
            disk_usage=disk_usage,
            metadata=self.__get_request_metadata("rebalance"))
Example No. 5
    @staticmethod
    def __get_workloads():
        wm = get_workload_manager()
        if wm is None:
            log.debug("Workload manager not yet present.")
            return []

        return wm.get_workloads()
Example No. 6
    def __predict_usage(self, workloads, cpu_usage):
        res = {}
        cpu_usage_predictor = self.__get_cpu_usage_predictor()

        cm = self.__config_manager
        pred_env = PredEnvironment(cm.get_region(), cm.get_environment(),
                                   dt.utcnow().hour)

        start_time = time.time()
        for w in workloads.values():  # TODO: batch the call
            # TODO: Integrate new prediction service
            pred = w.get_thread_count()
            if type(cpu_usage_predictor) is CpuUsagePredictor:
                pred = cpu_usage_predictor.predict(
                    w, cpu_usage.get(w.get_id(), None), pred_env)
                log.info("Predicted cpu usage: %s for workload: %s", pred,
                         w.get_id())
            else:
                log.info("Not predicting cpu usage for workload: %s",
                         w.get_id())
            res[w.get_id()] = pred
        stop_time = time.time()
        self.__call_meta['pred_cpu_usage_dur_secs'] = stop_time - start_time
        try:
            self.__call_meta[
                'pred_cpu_usage_model_id'] = cpu_usage_predictor.get_model(
                ).meta_data['model_training_titus_task_id']
        except Exception:
            self.__call_meta['pred_cpu_usage_model_id'] = 'unknown'

        log.debug("Usage prediction per workload: " + str(res))
        if len(res) > 0:
            self.__call_meta['pred_cpu_usage'] = dict(res)
        return res
Example No. 7
    def report_metrics(self, tags):
        if self.__registry is None:
            log.debug(
                "Not reporting metrics because there's no registry available yet."
            )
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug(
                "Not reporting metrics because there's no workload manager available yet."
            )
            return

        pcp_usage = self.get_pcp_usage()
        if CPU_USAGE not in pcp_usage:
            log.warning("No CPU usage in PCP usage.")
            return

        usage = pcp_usage[CPU_USAGE]
        static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
        burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY,
                              tags).set(static_pool_cpu_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY,
                              tags).set(burst_pool_cpu_usage)
Example No. 8
    def report_metrics(self, tags):
        if self.__registry is None:
            log.debug("Not reporting metrics because there's no registry available yet.")
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        workload_ids = wm.get_workload_map_copy().keys()
        usage_dict = self.__get_usage_dict(workload_ids)
        if CPU_USAGE not in usage_dict:
            log.warning("No CPU usage in usage: %s", usage_dict)
            return

        usage = usage_dict[CPU_USAGE]
        static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
        burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)

        with self.__metric_lock:
            self.__registry.counter(GET_RESOURCE_USAGE_FAILURE, tags).increment(self.__get_resource_usage_failure_count)
            self.__get_resource_usage_failure_count = 0
Example No. 9
    def test_fill_cpu(self):
        """
        Workload 0: 8 cores
        Workload 1: 4 cores
        Workload 2: 2 cores
        Workload 3: 1 core
        Workload 4: 1 core
        --------------------
        Total:      16 cores
        """
        for allocator in ALLOCATORS:
            cpu = get_cpu()
            workloads = [
                get_test_workload("a", 8, STATIC),
                get_test_workload("b", 4, STATIC),
                get_test_workload("c", 2, STATIC),
                get_test_workload("d", 1, STATIC),
                get_test_workload("e", 1, STATIC)
            ]

            tot_req = 0
            workload_map = {}
            for w in workloads:
                workload_map[w.get_id()] = w
                request = AllocateThreadsRequest(
                    cpu, w.get_id(), workload_map, {},
                    DEFAULT_TEST_REQUEST_METADATA)
                cpu = allocator.assign_threads(request).get_cpu()
                log.debug("{}".format(cpu))
                tot_req += w.get_thread_count()
                self.assertEqual(tot_req, len(cpu.get_claimed_threads()))
Example No. 10
    def add_window(self, start: datetime, end: datetime, free_cpu_count: int):
        node = self.__get_node()
        log.debug('owner_kind:%s owner_name:%s owner_uid:%s', node.kind,
                  node.metadata.name, node.metadata.uid)
        start_epoch_ms = int(unix_time_millis(start))
        end_epoch_ms = int(unix_time_millis(end))

        oppo_meta = V1ObjectMeta(
            namespace=OPPORTUNISTIC_RESOURCE_NAMESPACE,
            name="{}-{}-{}".format(node.metadata.name, start_epoch_ms,
                                   end_epoch_ms),
            labels={
                OPPORTUNISTIC_RESOURCE_NODE_NAME_LABEL_KEY: node.metadata.name,
                OPPORTUNISTIC_RESOURCE_NODE_UID_LABEL_KEY: node.metadata.uid
            },
            owner_references=[
                V1OwnerReference(api_version=node.api_version,
                                 kind=node.kind,
                                 name=node.metadata.name,
                                 uid=node.metadata.uid)
            ])
        oppo_spec = OpportunisticResourceSpec(
            capacity=OpportunisticResourceCapacity(free_cpu_count),
            window=OpportunisticResourceWindow(start_epoch_ms, end_epoch_ms))
        oppo_body = OpportunisticResource(metadata=oppo_meta, spec=oppo_spec)
        oppo = self.__custom_api.create_namespaced_custom_object(
            version=OPPORTUNISTIC_RESOURCE_VERSION,
            group=OPPORTUNISTIC_RESOURCE_GROUP,
            plural=OPPORTUNISTIC_RESOURCE_PLURAL,
            namespace=OPPORTUNISTIC_RESOURCE_NAMESPACE,
            body=oppo_body)
        log.debug('created window: %s', json.dumps(oppo))
Example No. 11
    def set_cpuset(self, container_name, thread_ids):
        log.debug("Updating container: '{}' to cpuset: '{}'".format(container_name, thread_ids))
        self.container_update_map[container_name] = thread_ids

        if container_name not in self.container_update_counts:
            self.container_update_counts[container_name] = 1
        else:
            self.container_update_counts[container_name] += 1
Example No. 12
    def __init__(self, total_threshold: float):
        """
        This class determines whether threads are free based on the cpu usage of workloads.

        :param total_threshold: The percentage of usage under which threads are considered to be free.
        """
        self.__threshold = total_threshold
        log.debug("{} created with threshold: '{}'".format(self.__class__.__name__, self.__threshold))
Example No. 13
    def __set(self, func: FunctionType, container_name: str, value: str):
        try:
            func(container_name, value)
            self.__write_succeeded(container_name)
        except Exception:
            self.__write_failed()
            log.debug("Failed to apply func: {} with value: {} to container: {}".format(
                func.__name__, value, container_name))
Example No. 14
def get_latest_cpu_model():
    models = get_cpu_models()
    if models is None or len(models) == 0:
        return None

    models = sorted(models, key=lambda e: e['LastModified'], reverse=True)
    log.debug("sorted models: {}".format(models))
    return models[0]
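The sort key suggests the model listing comes from an object store (S3 object metadata exposes a LastModified timestamp, for example). A quick standalone check of the selection logic with made-up entries:

from datetime import datetime

models = [
    {'Key': 'model-a', 'LastModified': datetime(2020, 1, 1)},
    {'Key': 'model-b', 'LastModified': datetime(2020, 6, 1)},
]
# Newest first, so index 0 is the most recently modified model.
latest = sorted(models, key=lambda e: e['LastModified'], reverse=True)[0]
assert latest['Key'] == 'model-b'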
Example No. 15
    def is_window_active(self) -> bool:
        oppo_list = self.__get_scoped_opportunistic_resources()
        log.debug('is active: oppo list: %s', json.dumps(oppo_list))
        for item in oppo_list['items']:
            log.debug('checking for window: %s', json.dumps(item))
            now = datetime.utcnow()
            if now < self.__get_timestamp(item['spec']['window']['end']):
                return True
        return False
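A window counts as active while the current time is still before its end timestamp. A self-contained sketch of the same check, assuming epoch-millisecond windows as produced by add_window above:

from datetime import datetime

def window_is_active(items, now=None):
    now = now or datetime.utcnow()
    for item in items:
        # Windows store epoch milliseconds; convert back for comparison.
        end = datetime.utcfromtimestamp(item['spec']['window']['end'] / 1000.0)
        if now < end:
            return True
    return False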
Example No. 16
    def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
        try:
            global_usage = GlobalResourceUsage(self.__get_usage_dict(workload_ids))
            log.debug("Got resource usage: %s", json.dumps(global_usage.serialize(), sort_keys=True, separators=(',', ':')))
            return global_usage
        except Exception:
            log.error("failed to get resource usage, returning empty usage")
            with self.__metric_lock:
                self.__get_resource_usage_failure_count += 1
            return GlobalResourceUsage({})
Example No. 17
    def rebalance(self, request: AllocateRequest) -> AllocateResponse:
        url = "{}/rebalance".format(self.__url)
        body = request.to_dict()
        log.debug("url: {}, body: {}".format(url, body))
        response = requests.put(url, json=body, headers=self.__headers, timeout=self.__timeout)
        log.debug("rebalance response code: {}".format(response.status_code))

        if response.status_code == 200:
            return deserialize_response(response.headers, response.json())

        raise CpuAllocationException("Failed to rebalance threads: {}".format(response.text))
Example No. 18
    def is_window_active(self) -> bool:
        with self.__lock:
            log.debug('is active: oppo list: %s',
                      json.dumps(self.__opportunistic_resources))
            for item in self.__opportunistic_resources.values():
                log.debug('checking for window: %s', json.dumps(item))
                now = datetime.utcnow()
                if now < self.__get_timestamp(
                        item['object']['spec']['window']['end']):
                    return True
            return False
Example No. 19
def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds",
                      sleep_time)
            time.sleep(sleep_time)
        except Exception:
            log.error("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)
Example No. 20
def _wait_for_file_to_exist(path, timeout, check_func=__noop):
    start_time = time.time()
    while not os.path.exists(path):
        log.debug("Waiting for file to exist: '{}'".format(path))
        time.sleep(0.1)

        check_func()

        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            raise TimeoutError(
                "Expected file '{}' was not created in '{}' seconds.".format(path, timeout))
Example No. 21
    def __detect_property_changes(self):
        for p in self.__properties:
            original_value = self.__original_properties[p]
            curr_value = self.__config_manager.get_str(p)
            log.debug("property: '{}' original: '{}' current: '{}'".format(
                p, original_value, curr_value))

            if original_value != curr_value:
                log.info(
                    "Restarting because property: '{}' changed from: '{}' to: '{}'"
                    .format(p, original_value, curr_value))

                self.__exit_handler.exit(GENERIC_PROPERTY_CHANGE_EXIT)
Example No. 22
    def __groom_events(self):
        while not self.__stopped:
            try:
                event = self.__raw_q.get(timeout=self.__event_timeout)
            except Empty:
                log.debug("Timed out waiting for event on queue.")
                continue

            event = event.decode("utf-8")
            event = json.loads(event)
            self.__event_logger.handle(event)

            Thread(target=self.__groom, args=[event]).start()
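Each raw event is decoded from bytes, parsed as JSON, and then groomed on its own thread so slow grooming never blocks the dequeue loop. The same flow in isolation, with a made-up payload:

import json
from threading import Thread

def groom(event):
    print("grooming:", event)  # stand-in for self.__groom

raw = b'{"status": "start", "id": "abc"}'  # illustrative payload shape
event = json.loads(raw.decode("utf-8"))
Thread(target=groom, args=[event]).start()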
Example No. 23
    def report_metrics(self, tags):
        log.debug("Reporting metrics")
        try:
            # Workload manager metrics
            self.__reg.gauge(RUNNING, tags).set(1)

            self.__reg.gauge(ADDED_KEY, tags).set(
                self.__workload_manager.get_added_count())
            self.__reg.gauge(REMOVED_KEY, tags).set(
                self.__workload_manager.get_removed_count())
            self.__reg.gauge(SUCCEEDED_KEY, tags).set(
                self.__workload_manager.get_success_count())
            self.__reg.gauge(FAILED_KEY, tags).set(
                self.__workload_manager.get_error_count())
            self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(
                len(self.__workload_manager.get_workloads()))

            # Allocator metrics
            self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(
                self.__workload_manager.get_allocator_call_duration_sum_secs())
            self.__reg.gauge(FALLBACK_ALLOCATOR_COUNT, tags).set(
                self.__workload_manager.get_fallback_allocator_calls_count())
            self.__reg.gauge(IP_ALLOCATOR_TIMEBOUND_COUNT, tags).set(
                self.__workload_manager.
                get_time_bound_ip_allocator_solution_count())

            # Event manager metrics
            self.__reg.gauge(QUEUE_DEPTH_KEY,
                             tags).set(self.__event_manager.get_queue_depth())
            self.__reg.gauge(EVENT_SUCCEEDED_KEY, tags).set(
                self.__event_manager.get_success_count())
            self.__reg.gauge(EVENT_FAILED_KEY,
                             tags).set(self.__event_manager.get_error_count())
            self.__reg.gauge(EVENT_PROCESSED_KEY, tags).set(
                self.__event_manager.get_processed_count())

            # CPU metrics
            cross_package_violation_count = len(
                get_cross_package_violations(
                    self.__workload_manager.get_cpu()))
            shared_core_violation_count = len(
                get_shared_core_violations(self.__workload_manager.get_cpu()))
            self.__reg.gauge(PACKAGE_VIOLATIONS_KEY,
                             tags).set(cross_package_violation_count)
            self.__reg.gauge(CORE_VIOLATIONS_KEY,
                             tags).set(shared_core_violation_count)
            log.debug("Reported metrics")

        except Exception:
            log.exception("Failed to report metrics")
Example No. 24
def __schedule_loop():
    while True:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        log.debug("Scheduling thread sleeping for: '{}' seconds".format(sleep_time))
        time.sleep(sleep_time)
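For reference, this loop is driven by the schedule library; a minimal runnable version with a stand-in job (pip install schedule):

import time
import schedule

schedule.every(10).seconds.do(lambda: print("tick"))

while True:
    schedule.run_pending()
    sleep_time = schedule.idle_seconds()
    # idle_seconds() is negative when a job is overdue and None when
    # nothing is scheduled, hence the fallback.
    if sleep_time is None or sleep_time < 0:
        sleep_time = 1
    time.sleep(sleep_time)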
Example No. 25
    def __update_workload(self, func, arg, workload_id):
        try:
            with self.__lock:
                log.debug("Acquired lock for func: {} on workload: {}".format(func.__name__, workload_id))
                start_time = time.time()
                func(arg)
                stop_time = time.time()
                self.__allocator_call_duration_sum_secs = stop_time - start_time

            log.debug("Released lock for func: {} on workload: {}".format(func.__name__, workload_id))
            return True
        except Exception:
            self.__error_count += 1
            log.exception("Failed to execute func: {} on workload: {}".format(func.__name__, workload_id))
            return False
Example No. 26
def get_predictions(client_cert_path: str, client_key_path: str, url: str,
                    body: dict) -> Optional[dict]:
    log.debug("url: %s, body: %s", url, body)
    response = requests.get(url,
                            json=body,
                            cert=(client_cert_path, client_key_path),
                            verify=False)
    if response.status_code != 200:
        log.error("Failed to query resource prediction service: %s",
                  response.content)
        return None

    resp_bytes = response.content
    resp_str = resp_bytes.decode('utf8')
    resp_json = json.loads(resp_str.strip())
    return resp_json
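A hedged usage sketch; the certificate paths, endpoint, and request body shape below are placeholders, not taken from the snippet:

# All values here are illustrative only.
predictions = get_predictions(
    client_cert_path="/path/to/client.crt",
    client_key_path="/path/to/client.key",
    url="https://prediction-service.example.com/predict",
    body={"workload_ids": ["a", "b"]})
if predictions is not None:
    print(predictions)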
Example No. 27
    def __process_events(self):
        while True:
            try:
                msg = self.__q.get()
                log.debug("Sending event log message: {}".format(msg))
                response = send_event_msg(msg, self.__address)

                if response.status_code != 200:
                    log.error("Re-enqueuing failed event log message: {}".format(response.content))
                    self.__retry_msg_count += 1
                    self.__q.put_nowait(msg)
                else:
                    self.__succeeded_msg_count += 1
            except Exception:
                self.__failed_msg_count += 1
                log.exception("Failed to process event log message.")
Example No. 28
    def report_metrics(self, tags):
        if self.__registry is None:
            log.debug("Not reporting metrics because there's no registry available yet.")
            return

        wm = get_workload_manager()
        if wm is None:
            log.debug("Not reporting metrics because there's no workload manager available yet.")
            return

        usage = self.get_cpu_usage(60, 60)
        static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
        burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

        self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
        self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)
Example No. 29
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except Exception:
        log.error("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)
Example No. 30
    def free_threads(self,
                     request: AllocateThreadsRequest) -> AllocateResponse:
        url = "{}/free_threads".format(self.__url)
        body = request.to_dict()
        log.debug("url: {}, body: {}".format(url, body))
        response = requests.put(url,
                                json=body,
                                headers=self.__headers,
                                timeout=self.__solver_max_runtime_secs)
        log.debug("free_threads response code: {}".format(
            response.status_code))

        if response.status_code == 200:
            return deserialize_response(response.headers, response.json())

        raise CpuAllocationException("Failed to free threads: {}".format(
            response.text))