def __update_workload(self, func, arg, workload_id):
    try:
        with self.__lock:
            log.debug("Acquired lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            start_time = time.time()
            func(arg)
            stop_time = time.time()

            if self.__reg is not None:
                self.__reg.distribution_summary(
                    self.__get_workload_processing_metric_name(func.__name__),
                    self.__tags).record(stop_time - start_time)
                self.__reg.distribution_summary(
                    WORKLOAD_PROCESSING_DURATION,
                    self.__tags).record(stop_time - start_time)

        log.debug("Released lock for func: {} on workload: {}".format(
            func.__name__, workload_id))
        return True
    except Exception:
        self.__error_count += 1
        log.exception("Failed to execute func: {} on workload: {}".format(
            func.__name__, workload_id))
        return False
def assign_threads(self, workload):
    thread_count = workload.get_thread_count()
    claimed_threads = []

    if thread_count == 0:
        return claimed_threads

    log.info("Assigning '{}' thread(s) to workload: '{}'".format(
        workload.get_thread_count(), workload.get_id()))

    if is_cpu_full(self.__cpu):
        raise ValueError("Cannot assign workload: '{}' to full CPU.".format(workload.get_id()))

    package = self.__cpu.get_emptiest_package()

    while thread_count > 0 and len(package.get_empty_threads()) > 0:
        core = get_emptiest_core(package)
        empty_threads = core.get_empty_threads()[:thread_count]

        for empty_thread in empty_threads:
            log.debug("Claiming package:core:thread '{}:{}:{}' for workload '{}'".format(
                package.get_id(), core.get_id(), empty_thread.get_id(), workload.get_id()))
            empty_thread.claim(workload.get_id())
            claimed_threads.append(empty_thread)
            thread_count -= 1

    # Recurse to place any remaining threads on the next emptiest package.
    return claimed_threads + self.assign_threads(
        Workload(workload.get_id(), thread_count, workload.get_type()))
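# A minimal, self-contained sketch of the greedy fill strategy above, assuming
# plain lists of free slots stand in for packages/threads (names here are
# illustrative, not the real CPU model):
def greedy_claim(packages, thread_count):
    claimed = []
    while thread_count > 0:
        # Pick the package with the most free slots, mirroring get_emptiest_package().
        package = max(packages, key=len)
        if not package:
            break  # CPU exhausted
        while thread_count > 0 and package:
            claimed.append(package.pop())
            thread_count -= 1
    return claimed

# Two packages with two free threads each; claiming three spills across packages.
assert len(greedy_claim([[0, 1], [2, 3]], 3)) == 3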
def __process_events(self):
    while not self.__stopped:
        try:
            event = self.__q.get(timeout=self.__event_timeout)
            dequeue_time = time.time()
            log.info("Dequeued event: {}, queue depth: {}".format(
                event[ACTION], self.get_queue_depth()))
            if self.__reg is not None:
                self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                self.__reg.counter(self.__get_dequeued_metric_name(event), self.__tags).increment()
                self.__reg.distribution_summary(QUEUE_LATENCY_KEY, self.__tags).record(
                    dequeue_time - event[ENQUEUE_TIME_KEY])
        except Empty:
            log.debug("Timed out waiting for event on queue.")
            continue

        for event_handler in self.__event_handlers:
            try:
                log.info("{} handling event: {}".format(
                    type(event_handler).__name__, event[ACTION]))
                event_handler.handle(event)
                self.__report_succeeded_event(event_handler)
            except Exception:
                log.exception("Event handler: '{}' failed to handle event: '{}'".format(
                    type(event_handler).__name__, event))
                self.__report_failed_event(event_handler)

        self.__q.task_done()
        # Mirror the guard above: the registry may not be available yet.
        if self.__reg is not None:
            self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
            self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
        self.__processed_count += 1
def __get_rebalance_request(self):
    workload_map = self.get_workload_map_copy()
    resource_usage = self.__wmm.get_resource_usage(workload_map.keys())
    log.debug("resource_usage: %s", json.dumps(resource_usage.serialize()))

    cpu_usage = self.__get_optional_default(resource_usage.get_cpu_usage, {})
    mem_usage = self.__get_optional_default(resource_usage.get_mem_usage, {})
    net_recv_usage = self.__get_optional_default(resource_usage.get_net_recv_usage, {})
    net_trans_usage = self.__get_optional_default(resource_usage.get_net_trans_usage, {})
    disk_usage = self.__get_optional_default(resource_usage.get_disk_usage, {})

    return AllocateRequest(
        cpu=self.get_cpu_copy(),
        workloads=workload_map,
        resource_usage=resource_usage,
        cpu_usage=cpu_usage,
        mem_usage=mem_usage,
        net_recv_usage=net_recv_usage,
        net_trans_usage=net_trans_usage,
        disk_usage=disk_usage,
        metadata=self.__get_request_metadata("rebalance"))
def __get_workloads():
    wm = get_workload_manager()
    if wm is None:
        log.debug("Workload manager not yet present.")
        return []

    return wm.get_workloads()
def __predict_usage(self, workloads, cpu_usage):
    res = {}
    cpu_usage_predictor = self.__get_cpu_usage_predictor()

    cm = self.__config_manager
    pred_env = PredEnvironment(cm.get_region(), cm.get_environment(), dt.utcnow().hour)

    start_time = time.time()
    for w in workloads.values():
        # TODO: batch the calls into a single request
        # TODO: integrate the new prediction service
        pred = w.get_thread_count()
        if type(cpu_usage_predictor) is CpuUsagePredictor:
            pred = cpu_usage_predictor.predict(
                w, cpu_usage.get(w.get_id(), None), pred_env)
            log.info("Predicted cpu usage: %s for workload: %s", pred, w.get_id())
        else:
            log.info("Not predicting cpu usage for workload: %s", w.get_id())
        res[w.get_id()] = pred
    stop_time = time.time()

    self.__call_meta['pred_cpu_usage_dur_secs'] = stop_time - start_time
    try:
        self.__call_meta['pred_cpu_usage_model_id'] = \
            cpu_usage_predictor.get_model().meta_data['model_training_titus_task_id']
    except Exception:
        self.__call_meta['pred_cpu_usage_model_id'] = 'unknown'

    log.debug("Usage prediction per workload: %s", res)
    if len(res) > 0:
        self.__call_meta['pred_cpu_usage'] = dict(res)
    return res
def report_metrics(self, tags):
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    wm = get_workload_manager()
    if wm is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    pcp_usage = self.get_pcp_usage()
    if CPU_USAGE not in pcp_usage:
        log.warning("No CPU usage in PCP usage.")
        return

    usage = pcp_usage[CPU_USAGE]
    static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
    burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)

    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)
def report_metrics(self, tags):
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    wm = get_workload_manager()
    if wm is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    workload_ids = wm.get_workload_map_copy().keys()
    usage_dict = self.__get_usage_dict(workload_ids)
    if CPU_USAGE not in usage_dict:
        log.warning("No CPU usage in usage: %s", usage_dict)
        return

    usage = usage_dict[CPU_USAGE]
    static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
    burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)
    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)

    with self.__metric_lock:
        self.__registry.counter(GET_RESOURCE_USAGE_FAILURE, tags).increment(
            self.__get_resource_usage_failure_count)
        self.__get_resource_usage_failure_count = 0
def test_fill_cpu(self):
    """
    Completely fill the CPU:
    Workload a: 8 threads
    Workload b: 4 threads
    Workload c: 2 threads
    Workload d: 1 thread
    Workload e: 1 thread
    ---------------------
    Total:     16 threads
    """
    for allocator in ALLOCATORS:
        cpu = get_cpu()
        workloads = [
            get_test_workload("a", 8, STATIC),
            get_test_workload("b", 4, STATIC),
            get_test_workload("c", 2, STATIC),
            get_test_workload("d", 1, STATIC),
            get_test_workload("e", 1, STATIC)
        ]

        tot_req = 0
        workload_map = {}
        for w in workloads:
            workload_map[w.get_id()] = w
            request = AllocateThreadsRequest(
                cpu, w.get_id(), workload_map, {}, DEFAULT_TEST_REQUEST_METADATA)
            cpu = allocator.assign_threads(request).get_cpu()
            log.debug("{}".format(cpu))
            tot_req += w.get_thread_count()
            self.assertEqual(tot_req, len(cpu.get_claimed_threads()))
def add_window(self, start: datetime, end: datetime, free_cpu_count: int):
    node = self.__get_node()
    log.debug('owner_kind:%s owner_name:%s owner_uid:%s',
              node.kind, node.metadata.name, node.metadata.uid)

    start_epoch_ms = int(unix_time_millis(start))
    end_epoch_ms = int(unix_time_millis(end))

    oppo_meta = V1ObjectMeta(
        namespace=OPPORTUNISTIC_RESOURCE_NAMESPACE,
        name="{}-{}-{}".format(node.metadata.name, start_epoch_ms, end_epoch_ms),
        labels={
            OPPORTUNISTIC_RESOURCE_NODE_NAME_LABEL_KEY: node.metadata.name,
            OPPORTUNISTIC_RESOURCE_NODE_UID_LABEL_KEY: node.metadata.uid
        },
        owner_references=[
            V1OwnerReference(api_version=node.api_version,
                             kind=node.kind,
                             name=node.metadata.name,
                             uid=node.metadata.uid)
        ])
    oppo_spec = OpportunisticResourceSpec(
        capacity=OpportunisticResourceCapacity(free_cpu_count),
        window=OpportunisticResourceWindow(start_epoch_ms, end_epoch_ms))
    oppo_body = OpportunisticResource(metadata=oppo_meta, spec=oppo_spec)

    oppo = self.__custom_api.create_namespaced_custom_object(
        version=OPPORTUNISTIC_RESOURCE_VERSION,
        group=OPPORTUNISTIC_RESOURCE_GROUP,
        plural=OPPORTUNISTIC_RESOURCE_PLURAL,
        namespace=OPPORTUNISTIC_RESOURCE_NAMESPACE,
        body=oppo_body)
    log.debug('created window: %s', json.dumps(oppo))
def set_cpuset(self, container_name, thread_ids):
    log.debug("Updating container: '{}' to cpuset: '{}'".format(container_name, thread_ids))
    self.container_update_map[container_name] = thread_ids
    if container_name not in self.container_update_counts:
        self.container_update_counts[container_name] = 1
    else:
        self.container_update_counts[container_name] += 1
def __init__(self, total_threshold: float):
    """
    This class determines whether threads are free based on the CPU usage of workloads.

    :param total_threshold: The percentage of usage under which threads are considered to be free.
    """
    self.__threshold = total_threshold
    log.debug("{} created with threshold: '{}'".format(
        self.__class__.__name__, self.__threshold))
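# A hedged sketch of how such a threshold is typically applied (the helper below
# is illustrative only; the class above just stores the threshold):
def _is_free(total_usage_pct: float, threshold: float) -> bool:
    # A thread is "free" when its workloads' combined usage sits under the threshold.
    return total_usage_pct < threshold

assert _is_free(2.5, 10.0)
assert not _is_free(25.0, 10.0)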
def __set(self, func: FunctionType, container_name: str, value: str):
    try:
        func(container_name, value)
        self.__write_succeeded(container_name)
    except Exception:
        self.__write_failed()
        log.exception("Failed to apply func: {} with value: {} to container: {}".format(
            func.__name__, value, container_name))
def get_latest_cpu_model():
    models = get_cpu_models()
    if models is None or len(models) == 0:
        return None

    models = sorted(models, key=lambda e: e['LastModified'], reverse=True)
    log.debug("sorted models: {}".format(models))
    return models[0]
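# Illustration of the selection above with hypothetical records: sorting
# descending on 'LastModified' puts the newest model at index 0.
from datetime import datetime

_models = [
    {'Key': 'model-a', 'LastModified': datetime(2020, 1, 1)},
    {'Key': 'model-b', 'LastModified': datetime(2020, 6, 1)},
]
assert sorted(_models, key=lambda e: e['LastModified'], reverse=True)[0]['Key'] == 'model-b'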
def is_window_active(self) -> bool:
    oppo_list = self.__get_scoped_opportunistic_resources()
    log.debug('is active: oppo list: %s', json.dumps(oppo_list))
    for item in oppo_list['items']:
        log.debug('checking for window: %s', json.dumps(item))
        now = datetime.utcnow()
        if now < self.__get_timestamp(item['spec']['window']['end']):
            return True
    return False
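# The window end is stored as epoch milliseconds (see add_window above); a
# conversion like the one below is what __get_timestamp is assumed to perform:
from datetime import datetime

def _to_utc(epoch_ms: int) -> datetime:
    return datetime.utcfromtimestamp(epoch_ms / 1000.0)

assert _to_utc(0) == datetime(1970, 1, 1)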
def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
    try:
        global_usage = GlobalResourceUsage(self.__get_usage_dict(workload_ids))
        log.debug("Got resource usage: %s",
                  json.dumps(global_usage.serialize(), sort_keys=True, separators=(',', ':')))
        return global_usage
    except Exception:
        log.exception("Failed to get resource usage, returning empty usage.")
        with self.__metric_lock:
            self.__get_resource_usage_failure_count += 1
        return GlobalResourceUsage({})
def rebalance(self, request: AllocateRequest) -> AllocateResponse:
    url = "{}/rebalance".format(self.__url)
    body = request.to_dict()
    log.debug("url: {}, body: {}".format(url, body))
    response = requests.put(url, json=body, headers=self.__headers, timeout=self.__timeout)
    log.debug("rebalance response code: {}".format(response.status_code))

    if response.status_code == 200:
        return deserialize_response(response.headers, response.json())

    raise CpuAllocationException("Failed to rebalance threads: {}".format(response.text))
def is_window_active(self) -> bool:
    with self.__lock:
        log.debug('is active: oppo list: %s', json.dumps(self.__opportunistic_resources))
        for item in self.__opportunistic_resources.values():
            log.debug('checking for window: %s', json.dumps(item))
            now = datetime.utcnow()
            if now < self.__get_timestamp(item['object']['spec']['window']['end']):
                return True
        return False
def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds", sleep_time)
            time.sleep(sleep_time)
        except Exception:
            log.exception("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)
def _wait_for_file_to_exist(path, timeout, check_func=__noop):
    start_time = time.time()

    while not os.path.exists(path):
        log.debug("Waiting for file to exist: '{}'".format(path))
        time.sleep(0.1)
        check_func()

        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            raise TimeoutError(
                "Expected file '{}' was not created in '{}' seconds.".format(path, timeout))
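# Usage sketch: poll for a flag file with a 5 second budget (the path below is a
# placeholder). The helper sleeps in 100ms increments and raises TimeoutError on expiry.
import os
import tempfile

flag = os.path.join(tempfile.gettempdir(), "ready.flag")
open(flag, "w").close()                  # pre-create so the wait returns immediately
_wait_for_file_to_exist(flag, timeout=5)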
def __detect_property_changes(self):
    for p in self.__properties:
        original_value = self.__original_properties[p]
        curr_value = self.__config_manager.get_str(p)
        log.debug("property: '{}' original: '{}' current: '{}'".format(
            p, original_value, curr_value))

        if original_value != curr_value:
            log.info("Restarting because property: '{}' changed from: '{}' to: '{}'".format(
                p, original_value, curr_value))
            self.__exit_handler.exit(GENERIC_PROPERTY_CHANGE_EXIT)
def __groom_events(self):
    while not self.__stopped:
        try:
            event = self.__raw_q.get(timeout=self.__event_timeout)
        except Empty:
            log.debug("Timed out waiting for event on queue.")
            continue

        event = event.decode("utf-8")
        event = json.loads(event)
        self.__event_logger.handle(event)
        Thread(target=self.__groom, args=[event]).start()
def report_metrics(self, tags):
    log.debug("Reporting metrics")
    try:
        # Workload manager metrics
        self.__reg.gauge(RUNNING, tags).set(1)
        self.__reg.gauge(ADDED_KEY, tags).set(self.__workload_manager.get_added_count())
        self.__reg.gauge(REMOVED_KEY, tags).set(self.__workload_manager.get_removed_count())
        self.__reg.gauge(SUCCEEDED_KEY, tags).set(self.__workload_manager.get_success_count())
        self.__reg.gauge(FAILED_KEY, tags).set(self.__workload_manager.get_error_count())
        self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(
            len(self.__workload_manager.get_workloads()))

        # Allocator metrics
        self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(
            self.__workload_manager.get_allocator_call_duration_sum_secs())
        self.__reg.gauge(FALLBACK_ALLOCATOR_COUNT, tags).set(
            self.__workload_manager.get_fallback_allocator_calls_count())
        self.__reg.gauge(IP_ALLOCATOR_TIMEBOUND_COUNT, tags).set(
            self.__workload_manager.get_time_bound_ip_allocator_solution_count())

        # Event manager metrics
        self.__reg.gauge(QUEUE_DEPTH_KEY, tags).set(self.__event_manager.get_queue_depth())
        self.__reg.gauge(EVENT_SUCCEEDED_KEY, tags).set(self.__event_manager.get_success_count())
        self.__reg.gauge(EVENT_FAILED_KEY, tags).set(self.__event_manager.get_error_count())
        self.__reg.gauge(EVENT_PROCESSED_KEY, tags).set(self.__event_manager.get_processed_count())

        # CPU metrics
        cross_package_violation_count = len(
            get_cross_package_violations(self.__workload_manager.get_cpu()))
        shared_core_violation_count = len(
            get_shared_core_violations(self.__workload_manager.get_cpu()))
        self.__reg.gauge(PACKAGE_VIOLATIONS_KEY, tags).set(cross_package_violation_count)
        self.__reg.gauge(CORE_VIOLATIONS_KEY, tags).set(shared_core_violation_count)

        log.debug("Reported metrics")
    except Exception:
        log.exception("Failed to report metrics")
def __schedule_loop():
    while True:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()
        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        log.debug("Scheduling thread sleeping for: '{}' seconds".format(sleep_time))
        time.sleep(sleep_time)
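# Minimal sketch of the `schedule` library contract the loop above relies on:
# idle_seconds() is the time until the next pending job and can be negative when
# a job is overdue, which is why the loop clamps it back to the default interval.
import schedule

schedule.every(10).seconds.do(lambda: log.info("tick"))
assert schedule.next_run() is not None
assert schedule.idle_seconds() <= 10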
def __update_workload(self, func, arg, workload_id):
    try:
        with self.__lock:
            log.debug("Acquired lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            start_time = time.time()
            func(arg)
            stop_time = time.time()
            self.__allocator_call_duration_sum_secs = stop_time - start_time

        log.debug("Released lock for func: {} on workload: {}".format(
            func.__name__, workload_id))
        return True
    except Exception:
        self.__error_count += 1
        log.exception("Failed to execute func: {} on workload: {}".format(
            func.__name__, workload_id))
        return False
def get_predictions(client_cert_path: str, client_key_path: str,
                    url: str, body: dict) -> Optional[dict]:
    log.debug("url: %s, body: %s", url, body)
    response = requests.get(url, json=body, cert=(client_cert_path, client_key_path), verify=False)
    if response.status_code != 200:
        log.error("Failed to query resource prediction service: %s", response.content)
        return None

    resp_bytes = response.content
    resp_str = resp_bytes.decode('utf8')
    resp_json = json.loads(resp_str.strip())
    return resp_json
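# Hypothetical invocation (cert paths, URL, and body shape are placeholders; the
# real endpoint and payload are defined by the prediction service):
preds = get_predictions(
    client_cert_path="/run/certs/client.crt",
    client_key_path="/run/certs/client.key",
    url="https://predictions.example.com/v1/cpu",
    body={"workload_ids": ["w1", "w2"]})
if preds is not None:
    log.info("predictions: %s", preds)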
def __process_events(self):
    while True:
        try:
            msg = self.__q.get()
            log.debug("Sending event log message: {}".format(msg))
            response = send_event_msg(msg, self.__address)

            if response.status_code != 200:
                log.error("Re-enqueuing failed event log message: {}".format(response.content))
                self.__retry_msg_count += 1
                self.__q.put_nowait(msg)
            else:
                self.__succeeded_msg_count += 1
        except Exception:
            self.__failed_msg_count += 1
            log.exception("Failed to process event log message.")
def report_metrics(self, tags):
    if self.__registry is None:
        log.debug("Not reporting metrics because there's no registry available yet.")
        return

    wm = get_workload_manager()
    if wm is None:
        log.debug("Not reporting metrics because there's no workload manager available yet.")
        return

    usage = self.get_cpu_usage(60, 60)
    static_pool_cpu_usage = self.__get_pool_usage(STATIC, usage)
    burst_pool_cpu_usage = self.__get_pool_usage(BURST, usage)
    self.__registry.gauge(STATIC_POOL_USAGE_KEY, tags).set(static_pool_cpu_usage)
    self.__registry.gauge(BURST_POOL_USAGE_KEY, tags).set(burst_pool_cpu_usage)
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()
        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except Exception:
        log.exception("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)
def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
    url = "{}/free_threads".format(self.__url)
    body = request.to_dict()
    log.debug("url: {}, body: {}".format(url, body))
    response = requests.put(url, json=body, headers=self.__headers,
                            timeout=self.__solver_max_runtime_secs)
    log.debug("free_threads response code: {}".format(response.status_code))

    if response.status_code == 200:
        return deserialize_response(response.headers, response.json())

    raise CpuAllocationException("Failed to free threads: {}".format(response.text))