def get_cpu_models():
    bucket_name = get_cpu_model_bucket_name()
    if bucket_name is None:
        log.error("Failed to get cpu model bucket name.")
        return None

    prefix_name = get_cpu_model_prefix_name()
    if prefix_name is None:
        log.error("Failed to get cpu model prefix name.")
        return None

    log.info("Getting model metadata from bucket: '{}', prefix: '{}'".format(
        bucket_name, prefix_name))
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix_name)

    CONTENTS = 'Contents'
    models = []
    for page in pages:
        if CONTENTS in page:
            for entry in page[CONTENTS]:
                models.append(entry)

    return models

def __init__(self, raw: dict):
    self.raw = raw
    self.model_version = raw.get(MODEL_VERSION, "UNKNOWN_MODEL_VERSION")
    self.model_instance_id = raw.get(MODEL_INSTANCE_ID, "UNKNOWN_MODEL_INSTANCE_ID")
    self.prediction_ts_ms = int(raw.get(PREDICTION_TS_MS, '0'))
    self.metadata = raw.get(META_DATA, {})
    if self.metadata is None:
        self.metadata = {}

    self.__predictions = {}
    self.__pred_time2empty_batch = []

    preds = raw.get(PREDICTIONS)
    if preds is not None:
        for p in preds:
            job_id = p.get(JOB_ID, "UNKNOWN_JOB_ID")
            self.__predictions[job_id] = ResourceUsagePrediction(p)

    if PRED_TIME2EMPTY_BATCH in self.metadata:
        content = self.metadata[PRED_TIME2EMPTY_BATCH]
        try:
            self.__pred_time2empty_batch = [
                float(e) for e in content.split(',')
            ]
            self.metadata.pop(PRED_TIME2EMPTY_BATCH)
        except Exception:
            log.error("Error parsing pred_time2empty_batch str: " + content)

def get_workload_from_kubernetes(identifier) -> Optional[KubernetesWorkload]:
    if not managers_are_initialized():
        log.error(
            "Cannot get workload from kubernetes because managers aren't initialized")
        return None

    retry_count = get_config_manager().get_int(
        GET_WORKLOAD_RETRY_COUNT, DEFAULT_GET_WORKLOAD_RETRY_COUNT)
    retry_interval = get_config_manager().get_float(
        GET_WORKLOAD_RETRY_INTERVAL_SEC, DEFAULT_GET_WORKLOAD_RETRY_INTERVAL_SEC)

    pod_manager = get_pod_manager()
    for _ in range(retry_count):
        log.info("Getting pod from kubernetes: %s", identifier)
        pod = pod_manager.get_pod(identifier)
        if pod is not None:
            log.info("Got pod from kubernetes: %s", identifier)
            return KubernetesWorkload(pod)
        log.info("Retrying getting pod from kubernetes in %s seconds", retry_interval)
        time.sleep(retry_interval)

    log.error("Failed to get pod from kubernetes: %s", identifier)
    return None

def get_cpu_predictions(
        self, workloads: List[Workload],
        resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
    pod_manager = get_pod_manager()
    if pod_manager is None:
        return None

    pods = []
    for w in workloads:
        pod = pod_manager.get_pod(w.get_id())
        if pod is None:
            log.warning("Failed to get pod for workload: %s", w.get_id())
        else:
            pods.append(pod)

    resource_usage_predictions = self.get_predictions(pods, resource_usage)

    predictions = {}
    if resource_usage_predictions is None:
        log.error("Got no resource usage predictions")
        return predictions
    else:
        log.info("Got resource usage predictions: %s",
                 json.dumps(resource_usage_predictions.raw))

    for w_id, prediction in resource_usage_predictions.predictions.items():
        predictions[w_id] = get_first_window_cpu_prediction(prediction)

    return predictions

def __get_simple_cpu_predictions(self) -> Dict[str, float]:
    cpu_predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
    if cpu_predictor is None:
        log.error("Failed to get cpu predictor")
        return {}

    workloads = self.__workload_manager.get_workloads()
    if len(workloads) == 0:
        log.warning("No workloads, skipping cpu usage prediction")
        return {}

    workload_ids = [w.get_id() for w in workloads]
    resource_usage = self.__workload_monitor_manager.get_resource_usage(workload_ids)

    log.info("Getting simple cpu predictions...")
    cpu_predictions = cpu_predictor.get_cpu_predictions(workloads, resource_usage)
    if cpu_predictions is None:
        log.error("Failed to get cpu predictions")
        return {}
    else:
        log.info("Got simple cpu predictions: %s", json.dumps(cpu_predictions))
        return cpu_predictions

def __update_workload(self, func, arg, workload_id):
    try:
        with self.__lock:
            log.debug("Acquired lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            start_time = time.time()
            func(arg)
            stop_time = time.time()
            if self.__reg is not None:
                self.__reg.distribution_summary(
                    self.__get_workload_processing_metric_name(func.__name__),
                    self.__tags).record(stop_time - start_time)
                self.__reg.distribution_summary(
                    WORKLOAD_PROCESSING_DURATION,
                    self.__tags).record(stop_time - start_time)

        log.debug("Released lock for func: {} on workload: {}".format(
            func.__name__, workload_id))
        return True
    except Exception:
        self.__error_count += 1
        log.error("Failed to execute func: {} on workload: {}".format(
            func.__name__, workload_id))
        return False

def get_required_property(key):
    value = get_config_manager().get_str(key)
    if value is None:
        log.error("Failed to retrieve property: '{}'".format(key))
        return None
    return value

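# Usage sketch, illustrative only and not part of the original code. The property
# key and the caller below are hypothetical; the point is that callers are expected
# to treat a None return as a hard configuration error.
def _example_require_property():
    region = get_required_property("titus.region")  # hypothetical key
    if region is None:
        raise RuntimeError("Missing required property: titus.region")
    return region
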
def _handle(self, event):
    try:
        if not self.__relevant(event):
            return

        if not managers_are_initialized():
            log.warning("Managers are not yet initialized")
            return

        self.handling_event(event, 'oversubscribing workloads')

        with self.__window_lock:
            if datetime.utcnow() < self.__window_end_time:
                self.__skip_count += 1
                self.handled_event(
                    event, 'skipping oversubscribe - a window is currently active')
                return
            self.__publish_window(event)
    except Exception:
        self.__fail_count += 1
        log.error("Event handler: '{}' failed to handle event: '{}'".format(
            self.__class__.__name__, event))

def get_cpu_predictions(
        self, workloads: List[Workload],
        resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
    pods = []
    for w in workloads:
        if w.get_object_type() is not KubernetesWorkload:
            log.warning(
                "Cannot predict non-Kubernetes workload %s: %s is not %s",
                w.get_id(), w.get_object_type(), KubernetesWorkload)
            continue
        pods.append(w.get_pod())

    resource_usage_predictions = self.get_predictions(pods, resource_usage)

    predictions = {}
    if resource_usage_predictions is None:
        log.error("Got no resource usage predictions")
        return predictions
    else:
        log.info("Got resource usage predictions: %s",
                 json.dumps(resource_usage_predictions.raw))

    for w_id, prediction in resource_usage_predictions.predictions.items():
        predictions[w_id] = get_first_window_cpu_prediction(prediction)

    return predictions

def populate_from_capacity_env(self):
    self.cpu = 0
    self.mem = 0
    self.disk = 0
    self.net = 0
    self.gpu = 0  # initialize gpu alongside the other resources so error paths leave it defined

    unknown = "UNKNOWN"
    instance_type = os.environ.get(EC2_INSTANCE_TYPE, unknown)
    if instance_type == unknown:
        log.error("Instance type environment variable not present: %s",
                  EC2_INSTANCE_TYPE)
        return

    if instance_type not in machine_types:
        log.error("Unexpected instance type encountered: %s", instance_type)
        return

    machine = machine_types[instance_type]
    self.cpu = machine[CPU]
    self.mem = machine[MEM]
    self.disk = machine[DISK]
    self.net = machine[NET]
    self.gpu = machine[GPU]
    log.info("Loaded node capacity: %s", self.to_dict())

def __process(self, request: AllocateRequest, req_type: str,
              is_delete: bool) -> AllocateResponse:
    req_wid = ''
    if isinstance(request, AllocateThreadsRequest):
        req_wid = request.get_workload_id()

    req = self.__build_base_req(request.get_cpu())
    req.metadata[REQ_TYPE_METADATA_KEY] = req_type  # for logging purposes server side

    for wid, w in request.get_workloads().items():
        req.task_to_job_id[wid] = w.get_job_id()
        if is_delete and wid == req_wid:
            continue
        req.tasks_to_place.append(wid)

    try:
        log.info("remote %s (tasks_to_place=%s)", req_type, req.tasks_to_place)
        response = self.__stub.ComputeIsolation(req, timeout=self.__call_timeout_secs)
    except grpc.RpcError as e:
        log.error("remote %s failed (tasks_to_place=%s):\n%s",
                  req_type, req.tasks_to_place, repr(e))
        raise e

    try:
        return self.__deser(response)
    except Exception as e:
        log.error("failed to deserialize response for remote %s of %s:\n%s",
                  req_type, req_wid, repr(e))
        raise e

def __process_events(self):
    while not self.__stopped:
        try:
            event = self.__q.get(timeout=self.__event_timeout)
            dequeue_time = time.time()
            log.info("Dequeued event: {}, queue depth: {}".format(
                event[ACTION], self.get_queue_depth()))
            if self.__reg is not None:
                self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                self.__reg.counter(self.__get_dequeued_metric_name(event),
                                   self.__tags).increment()
                self.__reg.distribution_summary(
                    QUEUE_LATENCY_KEY,
                    self.__tags).record(dequeue_time - event[ENQUEUE_TIME_KEY])
        except Empty:
            log.debug("Timed out waiting for event on queue.")
            continue

        for event_handler in self.__event_handlers:
            try:
                log.info("{} handling event: {}".format(
                    type(event_handler).__name__, event[ACTION]))
                event_handler.handle(event)
                self.__report_succeeded_event(event_handler)
            except Exception:
                log.error("Event handler: '{}' failed to handle event: '{}'".format(
                    type(event_handler).__name__, event))
                self.__report_failed_event(event_handler)

        self.__q.task_done()
        # Guard the registry here as well, matching the None check above.
        if self.__reg is not None:
            self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
            self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
        self.__processed_count += 1

def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
    url = "{}/free_threads".format(self.__url)
    body = request.to_dict()

    try:
        log.info("freeing threads remotely for workload: %s",
                 request.get_workload_id())
        response = requests.put(url, json=body, headers=self.__headers,
                                timeout=self.__timeout)
    except requests.exceptions.Timeout as e:
        log.error("freeing threads remotely for workload: %s timed out",
                  request.get_workload_id())
        raise e

    if response.status_code == 200:
        log.info(
            "freed threads remotely with response code: %s for workload: %s",
            response.status_code, request.get_workload_id())
        return deserialize_response(response.headers, response.json())

    log.error(
        "failed to free threads remotely for workload: %s with status code: %d",
        request.get_workload_id(), response.status_code)
    raise CpuAllocationException("Failed to free threads: {}".format(response.text))

def get_name(self) -> str:
    url = "{}/cpu_allocator".format(self.__url)
    try:
        response = requests.get(url, timeout=self.__timeout)
        return "Remote({})".format(response.text)
    except Exception:
        log.error("Failed to GET cpu allocator name.")
        return "Remote({})".format(UNKNOWN_CPU_ALLOCATOR)

def __get_allocator_class(allocator_str):
    if allocator_str not in CPU_ALLOCATORS:
        log.error(
            "Unexpected CPU allocator specified: '{}', falling back to default: '{}'".format(
                allocator_str, DEFAULT_ALLOCATOR))
        allocator_str = DEFAULT_ALLOCATOR

    return CPU_ALLOCATOR_NAME_TO_CLASS_MAP[allocator_str]

def __get_image(container):
    if REPO_DIGESTS in container.image.attrs:
        repo_digests = container.image.attrs[REPO_DIGESTS]
        if len(repo_digests) > 0:
            return repo_digests[0]

    log.error("Failed to extract image from container: '{}'".format(container.name))
    return ''

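# Illustrative sketch only, not part of the original module. It assumes __get_image
# is module-level as shown above, and uses the docker SDK's standard client and
# containers.list() API to resolve an image digest for each running container.
def _example_log_container_images():
    client = docker.from_env()
    for container in client.containers.list():
        log.info("container: %s image: %s", container.name, __get_image(container))
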
def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
    try:
        global_usage = GlobalResourceUsage(self.__get_usage_dict(workload_ids))
        log.debug("Got resource usage: %s",
                  json.dumps(global_usage.serialize(), sort_keys=True,
                             separators=(',', ':')))
        return global_usage
    except Exception:
        log.error("failed to get resource usage, returning empty usage")
        with self.__metric_lock:
            self.__get_resource_usage_failure_count += 1
        return GlobalResourceUsage({})

def report_event(self, payload: dict):
    try:
        payload['ts'] = str(datetime.datetime.utcnow())
        event = {
            "uuid": str(uuid.uuid4()),
            "payload": payload
        }
        msg = get_event_msg(event)
        self.__q.put_nowait(msg)
    except Exception:
        self.__failed_msg_count += 1
        log.error("Failed to report event for payload: {}".format(payload))

def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds", sleep_time)
            time.sleep(sleep_time)
        except Exception:
            log.error("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)

def get_duration_predictions(input_str: str) -> List[DurationPrediction]:
    try:
        # "0.05=0.29953;0.1=0.29953;0.15=0.29953;0.2=0.29953;0.25=0.29953;0.3=0.29953;0.35=0.29953;0.4=0.29953;0.45=0.29953;0.5=0.29953;0.55=0.29953;0.6=0.29953;0.65=0.29953;0.7=0.29953;0.75=0.29953;0.8=0.29953;0.85=0.29953;0.9=0.29953;0.95=0.29953"
        duration_predictions = []
        pairs = input_str.split(';')
        for p in pairs:
            k, v = p.split('=')
            duration_predictions.append(DurationPrediction(float(k), float(v)))
        return duration_predictions
    except Exception:
        log.error("Failed to parse duration predictions: '{}'".format(input_str))
        return []

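# Usage sketch for get_duration_predictions, illustrative only. The input format is
# taken from the inline example above; the assertions are not part of the original code.
def _example_parse_duration_predictions():
    parsed = get_duration_predictions("0.05=0.29953;0.1=0.29953;0.95=0.29953")
    assert len(parsed) == 3

    # Malformed input is swallowed: the parser logs an error and returns [].
    assert get_duration_predictions("not-a-prediction-string") == []
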
def rebalance(self, request: AllocateRequest) -> AllocateResponse:
    try:
        self.__primary_rebalance_call_count += 1
        self.__should_fallback_immediately()
        return self.__primary_allocator.rebalance(request)
    except Exception as e:
        log.error(
            "Failed to rebalance workloads: '{}' with primary allocator: '{}', "
            "falling back to: '{}' because '{}'".format(
                [w.get_id() for w in request.get_workloads().values()],
                self.__primary_allocator.__class__.__name__,
                self.__secondary_allocator.__class__.__name__, e))
        self.__secondary_rebalance_call_count += 1
        return self.__secondary_allocator.rebalance(request)

def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
    try:
        self.__primary_free_threads_call_count += 1
        self.__should_fallback_immediately()
        return self.__primary_allocator.free_threads(request)
    except Exception as e:
        log.error(
            "Failed to free threads for workload: '{}' with primary allocator: '{}', "
            "falling back to: '{}' because '{}'".format(
                request.get_workload_id(),
                self.__primary_allocator.__class__.__name__,
                self.__secondary_allocator.__class__.__name__, e))
        self.__secondary_free_threads_call_count += 1
        return self.__secondary_allocator.free_threads(request)

def get_predictions(
        self, running_pods: List[V1Pod],
        resource_usage: GlobalResourceUsage) -> Optional[ResourceUsagePredictions]:
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager not yet set.")
        return None

    client_crt = get_client_cert_path(config_manager)
    client_key = get_client_key_path(config_manager)
    if client_crt is None or client_key is None:
        log.error("Failed to generate credential paths")
        return None

    url = get_url(config_manager)
    if url is None:
        log.error("Unable to generate prediction service url")
        return None

    body = self.__get_body(running_pods, resource_usage)
    if body is None:
        log.error("Unable to generate a prediction request body")
        return None

    predictions = get_predictions(client_crt, client_key, url, body)
    if predictions is None:
        log.error("Failed to get predictions")
        return None

    return ResourceUsagePredictions(predictions)

def __remove_workload(self, workload_id):
    log.info("Removing workload: {}".format(workload_id))
    if workload_id not in self.__workloads:
        log.error("Attempted to remove unknown workload: '{}'".format(workload_id))
        return

    workload_map = self.get_workload_map_copy()
    request = self.__get_threads_request(workload_id, workload_map, "free")
    response = self.__cpu_allocator.free_threads(request)
    workload_map.pop(workload_id)

    self.__update_state(response.get_cpu(), workload_map)
    report_cpu_event(request, response)

def _handle(self, event):
    try:
        if not self.__relevant(event):
            self.ignored_event(event, "irrelevant")
            return

        with self.__publish_lock:
            self.__publisher.publish()

        with self.__metric_lock:
            self.__publish_success_count += 1
    except Exception:
        with self.__metric_lock:
            self.__publish_failure_count += 1
        log.error("Failed to publish resource usage predictions")

def init():
    # Initialize currently running containers as workloads
    log.info("Isolating currently running workloads...")
    for workload in get_current_workloads(docker.from_env()):
        try:
            workload_manager.add_workload(workload)
        except Exception:
            log.error(
                "Failed to add currently running workload: '{}', maybe it exited.".format(
                    workload.get_id()))

    log.info("Isolated currently running workloads.")

    # Start processing events after adding running workloads to avoid processing
    # a die event before we add a workload
    event_manager.start_processing_events()
    _notify_ready()

def get_free_threads(
        self,
        cpu: Cpu,
        workload_map: Dict[str, Workload],
        cpu_usage: Optional[Dict[str, float]] = None) -> List[Thread]:
    if cpu_usage is None:
        log.error("CPU usage is required, defaulting to EMPTY threads being free.")
        return cpu.get_empty_threads()

    free_threads = []
    for c in get_free_cores(self.__threshold, cpu, workload_map, cpu_usage):
        free_threads += c.get_threads()

    return free_threads

def get_allocator(allocator_str, config_manager):
    if allocator_str not in CPU_ALLOCATORS:
        log.error(
            "Unexpected CPU allocator specified: '{}', falling back to default: '{}'".format(
                allocator_str, DEFAULT_ALLOCATOR))
        allocator_str = DEFAULT_ALLOCATOR

    free_thread_provider = get_free_thread_provider(config_manager)

    if allocator_str != FORECAST_CPU_IP:
        return CPU_ALLOCATOR_NAME_TO_CLASS_MAP[allocator_str](free_thread_provider)

    return ForecastIPCpuAllocator(
        cpu_usage_predictor_manager=get_cpu_usage_predictor_manager(),
        config_manager=config_manager,
        free_thread_provider=free_thread_provider)

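# Usage sketch, illustrative only. It reuses names already present above
# (FORECAST_CPU_IP, get_config_manager, get_allocator); the wrapper function itself
# is hypothetical and not part of the original code.
def _example_build_allocator():
    config_manager = get_config_manager()
    allocator = get_allocator(FORECAST_CPU_IP, config_manager)
    log.info("Constructed allocator: %s", type(allocator).__name__)
    return allocator
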
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except Exception:
        log.error("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)

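# Minimal sketch of how work would be registered for the scheduling loop above,
# using the third-party 'schedule' library the code already relies on. The job body
# and the 30-second interval are hypothetical, not taken from the original code.
def _example_register_jobs():
    import schedule  # already a dependency of the module above

    def _heartbeat_job():
        log.info("heartbeat job ran")

    # _schedule_once() picks this job up via schedule.run_pending() and sizes its
    # sleep from schedule.next_run() / schedule.idle_seconds().
    schedule.every(30).seconds.do(_heartbeat_job)
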
def download_latest_cpu_model(path=get_cpu_model_file_path()):
    log.info("Downloading latest cpu prediction model.")
    latest_model = get_latest_cpu_model()
    if latest_model is None:
        log.error("Failed to download model because no model found.")
        return

    bucket_name = get_cpu_model_bucket_name()
    key = latest_model['Key']
    s3_client = boto3.client('s3')
    log.info("Downloading latest cpu prediction model: '{}/{}' to: '{}'".format(
        bucket_name, key, path))
    s3_client.download_file(bucket_name, key, path)
    log.info("Downloaded latest cpu prediction model: '{}/{}' to: '{}'".format(
        bucket_name, key, path))