Example #1
def get_cpu_models():
    bucket_name = get_cpu_model_bucket_name()
    if bucket_name is None:
        log.error("Failed to get cpu model bucket name.")
        return None

    prefix_name = get_cpu_model_prefix_name()
    if prefix_name is None:
        log.error("Failed to get cpu model prefix name.")
        return None

    log.info("Getting model metadata from bucket: '{}', prefix: '{}'".format(
        bucket_name, prefix_name))

    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix_name)

    CONTENTS = 'Contents'
    models = []
    for page in pages:
        if CONTENTS in page:
            for entry in page[CONTENTS]:
                models.append(entry)

    return models
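A minimal usage sketch (the caller and the recency logic are hypothetical; each returned entry is a standard S3 listing record, which carries 'Key' and 'LastModified' fields):

# Hypothetical caller: pick the most recently modified model object.
models = get_cpu_models()
if models is None:
    log.error("Could not list cpu models.")
else:
    latest = max(models, key=lambda entry: entry['LastModified'], default=None)
    if latest is not None:
        log.info("Latest model key: %s", latest['Key'])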
Example #2
    def __init__(self, raw: dict):
        self.raw = raw
        self.model_version = raw.get(MODEL_VERSION, "UNKNOWN_MODEL_VERSION")
        self.model_instance_id = raw.get(MODEL_INSTANCE_ID,
                                         "UNKNOWN_MODEL_INSTANCE_ID")
        self.prediction_ts_ms = int(raw.get(PREDICTION_TS_MS, '0'))
        self.metadata = raw.get(META_DATA, {})
        if self.metadata is None:
            self.metadata = {}
        self.__predictions = {}
        self.__pred_time2empty_batch = []

        preds = raw.get(PREDICTIONS)
        if preds is not None:
            for p in preds:
                job_id = p.get(JOB_ID, "UNKNOWN_JOB_ID")
                self.__predictions[job_id] = ResourceUsagePrediction(p)

        if PRED_TIME2EMPTY_BATCH in self.metadata:
            content = self.metadata[PRED_TIME2EMPTY_BATCH]
            try:
                self.__pred_time2empty_batch = [
                    float(e) for e in content.split(',')
                ]
                self.metadata.pop(PRED_TIME2EMPTY_BATCH)
            except Exception:
                log.error("Error parsing pred_time2empty_batch str: " +
                          content)
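For context, a hypothetical payload sketch. This constructor appears to belong to ResourceUsagePredictions (as used in Example #23), and the concrete key strings behind the module constants are assumptions, not taken from the source:

# Hypothetical payload; the constants (MODEL_VERSION, PREDICTIONS, ...) are
# assumed to name the JSON fields read above.
raw = {
    MODEL_VERSION: "v1.2",
    MODEL_INSTANCE_ID: "instance-42",
    PREDICTION_TS_MS: "1600000000000",
    META_DATA: {PRED_TIME2EMPTY_BATCH: "1.5,2.5,3.0"},
    PREDICTIONS: [{JOB_ID: "job-a"}],
}
predictions = ResourceUsagePredictions(raw)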
Example #3
def get_workload_from_kubernetes(identifier) -> Optional[KubernetesWorkload]:
    if not managers_are_initialized():
        log.error(
            "Cannot get workload from kubernetes because managers aren't initialized"
        )
        return None

    retry_count = get_config_manager().get_int(
        GET_WORKLOAD_RETRY_COUNT, DEFAULT_GET_WORKLOAD_RETRY_COUNT)
    retry_interval = get_config_manager().get_float(
        GET_WORKLOAD_RETRY_INTERVAL_SEC,
        DEFAULT_GET_WORKLOAD_RETRY_INTERVAL_SEC)

    pod_manager = get_pod_manager()
    for i in range(retry_count):
        log.info("Getting pod from kubernetes: %s", identifier)
        pod = pod_manager.get_pod(identifier)
        if pod is not None:
            log.info("Got pod from kubernetes: %s", identifier)
            return KubernetesWorkload(pod)

        log.info("Retrying getting pod from kubernetes in %s seconds",
                 retry_interval)
        time.sleep(retry_interval)

    log.error("Failed to get pod from kubernetes: %s", identifier)
    return None
Example #4
    def get_cpu_predictions(
            self, workloads: List[Workload],
            resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
        pod_manager = get_pod_manager()
        if pod_manager is None:
            return None

        pods = []
        for w in workloads:
            pod = pod_manager.get_pod(w.get_id())
            if pod is None:
                log.warning("Failed to get pod for workload: %s", w.get_id())
            else:
                pods.append(pod)

        resource_usage_predictions = self.get_predictions(pods, resource_usage)

        predictions = {}
        if resource_usage_predictions is None:
            log.error("Got no resource usage predictions")
            return predictions
        else:
            log.info("Got resource usage predictions: %s",
                     json.dumps(resource_usage_predictions.raw))

        for w_id, prediction in resource_usage_predictions.predictions.items():
            predictions[w_id] = get_first_window_cpu_prediction(prediction)

        return predictions
Example #5
    def __get_simple_cpu_predictions(self) -> Dict[str, float]:
        cpu_predictor = self.__cpu_usage_predictor_manager.get_cpu_predictor()
        if cpu_predictor is None:
            log.error("Failed to get cpu predictor")
            return {}

        workloads = self.__workload_manager.get_workloads()
        if len(workloads) == 0:
            log.warning("No workloads, skipping cpu usage prediction")
            return {}

        workload_ids = [w.get_id() for w in workloads]
        resource_usage = self.__workload_monitor_manager.get_resource_usage(
            workload_ids)

        log.info("Getting simple cpu predictions...")
        cpu_predictions = cpu_predictor.get_cpu_predictions(
            workloads, resource_usage)
        if cpu_predictions is None:
            log.error("Failed to get cpu predictions")
            return {}
        else:
            log.info("Got simple cpu predictions: %s",
                     json.dumps(cpu_predictions))
            return cpu_predictions
Example #6
    def __update_workload(self, func, arg, workload_id):
        try:
            with self.__lock:
                log.debug("Acquired lock for func: {} on workload: {}".format(
                    func.__name__, workload_id))
                start_time = time.time()
                func(arg)
                stop_time = time.time()
                if self.__reg is not None:
                    self.__reg.distribution_summary(
                        self.__get_workload_processing_metric_name(
                            func.__name__),
                        self.__tags).record(stop_time - start_time)
                    self.__reg.distribution_summary(
                        WORKLOAD_PROCESSING_DURATION,
                        self.__tags).record(stop_time - start_time)

            log.debug("Released lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            return True
        except Exception:
            self.__error_count += 1
            log.error("Failed to execute func: {} on workload: {}".format(
                func.__name__, workload_id))
            return False
Example #7
def get_required_property(key):
    value = get_config_manager().get_str(key)
    if value is None:
        log.error("Failed to retrieve property: '{}'".format(key))
        return None

    return value
Example #8
    def _handle(self, event):
        try:
            if not self.__relevant(event):
                return

            if not managers_are_initialized():
                log.warning("Managers are not yet initialized")
                return None

            self.handling_event(event, 'oversubscribing workloads')

            with self.__window_lock:
                if datetime.utcnow() < self.__window_end_time:
                    self.__skip_count += 1
                    self.handled_event(
                        event,
                        'skipping oversubscribe - a window is currently active'
                    )
                    return

                self.__publish_window(event)

        except Exception:
            self.__fail_count += 1
            log.error(
                "Event handler: '{}' failed to handle event: '{}'".format(
                    self.__class__.__name__, event))
Example #9
    def get_cpu_predictions(
            self, workloads: List[Workload],
            resource_usage: GlobalResourceUsage) -> Optional[Dict[str, float]]:
        pods = []
        for w in workloads:
            if w.get_object_type() is not KubernetesWorkload:
                log.warning(
                    "Cannot predict non Kubernetes workload %s: %s is not %s",
                    w.get_id(), w.get_object_type(), KubernetesWorkload)
                continue

            pods.append(w.get_pod())

        resource_usage_predictions = self.get_predictions(pods, resource_usage)

        predictions = {}
        if resource_usage_predictions is None:
            log.error("Got no resource usage predictions")
            return predictions
        else:
            log.info("Got resource usage predictions: %s",
                     json.dumps(resource_usage_predictions.raw))

        for w_id, prediction in resource_usage_predictions.predictions.items():
            predictions[w_id] = get_first_window_cpu_prediction(prediction)

        return predictions
Example #10
    def populate_from_capacity_env(self):
        self.cpu = 0
        self.mem = 0
        self.disk = 0
        self.net = 0
        self.gpu = 0

        unknown = "UNKNOWN"
        instance_type = os.environ.get(EC2_INSTANCE_TYPE, unknown)

        if instance_type == unknown:
            log.error("Instance type environment variable not present: %s", EC2_INSTANCE_TYPE)
            return

        if instance_type not in machine_types:
            log.error("Unexpected instance type encountered: %s", instance_type)
            return

        machine = machine_types[instance_type]
        self.cpu = machine[CPU]
        self.mem = machine[MEM]
        self.disk = machine[DISK]
        self.net = machine[NET]
        self.gpu = machine[GPU]

        log.info("Loaded node capacity: %s", self.to_dict())
Example #11
    def __process(self, request: AllocateRequest, req_type: str,
                  is_delete: bool) -> AllocateResponse:
        req_wid = ''
        if isinstance(request, AllocateThreadsRequest):
            req_wid = request.get_workload_id()
        req = self.__build_base_req(request.get_cpu())
        req.metadata[
            REQ_TYPE_METADATA_KEY] = req_type  # for logging purposes server side

        for wid, w in request.get_workloads().items():
            req.task_to_job_id[wid] = w.get_job_id()
            if is_delete and wid == req_wid:
                continue
            req.tasks_to_place.append(wid)

        try:
            log.info("remote %s (tasks_to_place=%s)", req_type,
                     req.tasks_to_place)
            response = self.__stub.ComputeIsolation(
                req, timeout=self.__call_timeout_secs)
        except grpc.RpcError as e:
            log.error("remote %s failed (tasks_to_place=%s):\n%s", req_type,
                      req.tasks_to_place, repr(e))
            raise e

        try:
            return self.__deser(response)
        except Exception as e:
            log.error("failed to deseralize response for remote %s of %s:\n%s",
                      req_type, req_wid, repr(e))
            raise e
Example #12
    def __process_events(self):
        while not self.__stopped:
            try:
                event = self.__q.get(timeout=self.__event_timeout)
                dequeue_time = time.time()
                log.info("Dequeued event: {}, queue depth: {}".format(event[ACTION], self.get_queue_depth()))
                if self.__reg is not None:
                    self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                    self.__reg.counter(self.__get_dequeued_metric_name(event), self.__tags).increment()
                    self.__reg.distribution_summary(QUEUE_LATENCY_KEY, self.__tags).record(dequeue_time - event[ENQUEUE_TIME_KEY])
            except Empty:
                log.debug("Timed out waiting for event on queue.")
                continue

            for event_handler in self.__event_handlers:
                try:
                    log.info("{} handling event: {}".format(type(event_handler).__name__, event[ACTION]))
                    event_handler.handle(event)
                    self.__report_succeeded_event(event_handler)
                except Exception:
                    log.error("Event handler: '{}' failed to handle event: '{}'".format(
                        type(event_handler).__name__, event))
                    self.__report_failed_event(event_handler)

            self.__q.task_done()
            if self.__reg is not None:
                self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
                self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
            self.__processed_count += 1
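The consumer above only requires that each queued event carry ACTION and ENQUEUE_TIME_KEY; a hypothetical enqueue-side sketch (the queue handle and the 'die' action are assumptions):

# Hypothetical producer side; q is the same Queue the loop above consumes.
event = {
    ACTION: "die",
    ENQUEUE_TIME_KEY: time.time(),
}
q.put(event)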
Example #13
    def free_threads(self,
                     request: AllocateThreadsRequest) -> AllocateResponse:
        url = "{}/free_threads".format(self.__url)
        body = request.to_dict()

        try:
            log.info("freeing threads remotely for workload: %s",
                     request.get_workload_id())
            response = requests.put(url,
                                    json=body,
                                    headers=self.__headers,
                                    timeout=self.__timeout)
        except requests.exceptions.Timeout as e:
            log.error("freeing threads remotely for workload: %s timed out",
                      request.get_workload_id())
            raise e

        if response.status_code == 200:
            log.info(
                "freed threads remotely with response code: %s for workload: %s",
                response.status_code, request.get_workload_id())
            return deserialize_response(response.headers, response.json())

        log.error(
            "failed to free threads remotely for workload: %s with status code: %d",
            request.get_workload_id(), response.status_code)
        raise CpuAllocationException("Failed to free threads: {}".format(
            response.text))
Example #14
    def get_name(self) -> str:
        url = "{}/cpu_allocator".format(self.__url)
        try:
            response = requests.get(url, timeout=self.__timeout)
            return "Remote({})".format(response.text)
        except Exception:
            log.error("Failed to GET cpu allocator name.")
            return "Remote({})".format(UNKNOWN_CPU_ALLOCATOR)
Example #15
def __get_allocator_class(allocator_str):
    if allocator_str not in CPU_ALLOCATORS:
        log.error(
            "Unexpected CPU allocator specified: '{}', falling back to default: '{}'"
            .format(allocator_str, DEFAULT_ALLOCATOR))
        allocator_str = DEFAULT_ALLOCATOR

    return CPU_ALLOCATOR_NAME_TO_CLASS_MAP[allocator_str]
Example #16
def __get_image(container):
    if REPO_DIGESTS in container.image.attrs:
        repo_digests = container.image.attrs[REPO_DIGESTS]
        if len(repo_digests) > 0:
            return repo_digests[0]

    log.error("Failed to extract image from container: '{}'".format(
        container.name))
    return ''
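For reference, a hypothetical usage sketch; with docker-py, image.attrs['RepoDigests'] (when present) is a list of 'repository@sha256:<digest>' strings, so the function returns the first digest:

# Hypothetical usage within the same module.
import docker

for container in docker.from_env().containers.list():
    image = __get_image(container)  # e.g. 'repo@sha256:0123...' or '' on failure
    log.info("Container '%s' runs image '%s'", container.name, image)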
Example #17
    def get_resource_usage(self, workload_ids: List[str]) -> GlobalResourceUsage:
        try:
            global_usage = GlobalResourceUsage(
                self.__get_usage_dict(workload_ids))
            log.debug(
                "Got resource usage: %s",
                json.dumps(global_usage.serialize(),
                           sort_keys=True,
                           separators=(',', ':')))
            return global_usage
        except Exception:
            log.error("failed to get resource usage, returning empty usage")
            with self.__metric_lock:
                self.__get_resource_usage_failure_count += 1
            return GlobalResourceUsage({})
Example #18
    def report_event(self, payload: dict):
        try:
            payload['ts'] = str(datetime.datetime.utcnow())
            event = {
                "uuid": str(uuid.uuid4()),
                "payload": payload
            }
            msg = get_event_msg(event)
            self.__q.put_nowait(msg)
        except Exception:
            self.__failed_msg_count += 1
            log.error("Failed to report event for payload: {}".format(payload))
Example #19
def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds",
                      sleep_time)
            time.sleep(sleep_time)
        except Exception:
            log.error("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)
Example #20
def get_duration_predictions(input_str: str) -> List[DurationPrediction]:
    try:
        # "0.05=0.29953;0.1=0.29953;0.15=0.29953;0.2=0.29953;0.25=0.29953;0.3=0.29953;0.35=0.29953;0.4=0.29953;0.45=0.29953;0.5=0.29953;0.55=0.29953;0.6=0.29953;0.65=0.29953;0.7=0.29953;0.75=0.29953;0.8=0.29953;0.85=0.29953;0.9=0.29953;0.95=0.29953"
        duration_predictions = []
        pairs = input_str.split(';')
        for p in pairs:
            k, v = p.split('=')
            duration_predictions.append(DurationPrediction(float(k), float(v)))

        return duration_predictions
    except Exception:
        log.error(
            "Failed to parse duration predictions: '{}'".format(input_str))
        return []
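A usage sketch based on the 'percentile=value' format documented in the comment above (the inputs here are illustrative):

# Well-formed input parses into one DurationPrediction per pair.
preds = get_duration_predictions("0.05=0.29953;0.1=0.31205")
assert len(preds) == 2
# Malformed input logs an error and yields an empty list.
assert get_duration_predictions("not-a-prediction") == []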
Example #21
    def rebalance(self, request: AllocateRequest) -> AllocateResponse:
        try:
            self.__primary_rebalance_call_count += 1
            self.__should_fallback_immediately()
            return self.__primary_allocator.rebalance(request)
        except Exception as e:
            log.error(
                "Failed to rebalance workloads: '{}' with primary allocator: '{}', falling back to: '{}' because '{}'".format(
                    [w.get_id() for w in request.get_workloads().values()],
                    self.__primary_allocator.__class__.__name__,
                    self.__secondary_allocator.__class__.__name__,
                    e))
            self.__secondary_rebalance_call_count += 1
            return self.__secondary_allocator.rebalance(request)
Example #22
    def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
        try:
            self.__primary_free_threads_call_count += 1
            self.__should_fallback_immediately()
            return self.__primary_allocator.free_threads(request)
        except Exception as e:
            log.error(
                "Failed to free threads for workload: '{}' with primary allocator: '{}', falling back to: '{}' because '{}'".format(
                    request.get_workload_id(),
                    self.__primary_allocator.__class__.__name__,
                    self.__secondary_allocator.__class__.__name__,
                    e))
            self.__secondary_free_threads_call_count += 1
            return self.__secondary_allocator.free_threads(request)
Example #23
    def get_predictions(
        self, running_pods: List[V1Pod], resource_usage: GlobalResourceUsage
    ) -> Optional[ResourceUsagePredictions]:
        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager not yet set.")
            return None

        client_crt = get_client_cert_path(config_manager)
        client_key = get_client_key_path(config_manager)
        if client_crt is None or client_key is None:
            log.error("Failed to generate credential paths")
            return None

        url = get_url(config_manager)
        if url is None:
            log.error("Unable to generate prediction service url")
            return None

        body = self.__get_body(running_pods, resource_usage)
        if body is None:
            log.error("Unable to generate a prediction request body")
            return None

        predictions = get_predictions(client_crt, client_key, url, body)
        if predictions is None:
            log.error("Failed to get predictions")
            return None

        return ResourceUsagePredictions(predictions)
Example #24
    def __remove_workload(self, workload_id):
        log.info("Removing workload: {}".format(workload_id))
        if workload_id not in self.__workloads:
            log.error("Attempted to remove unknown workload: '{}'".format(workload_id))
            return

        workload_map = self.get_workload_map_copy()

        request = self.__get_threads_request(workload_id, workload_map, "free")
        response = self.__cpu_allocator.free_threads(request)

        workload_map.pop(workload_id)
        self.__update_state(response.get_cpu(), workload_map)
        report_cpu_event(request, response)
Example #25
    def _handle(self, event):
        try:
            if not self.__relevant(event):
                self.ignored_event(event, "irrelevant")
                return

            with self.__publish_lock:
                self.__publisher.publish()

            with self.__metric_lock:
                self.__publish_success_count += 1
        except Exception:
            with self.__metric_lock:
                self.__publish_failure_count += 1
            log.error("Failed to publish resource usage predictions")
Example #26
def init():
    # Initialize currently running containers as workloads
    log.info("Isolating currently running workloads...")
    for workload in get_current_workloads(docker.from_env()):
        try:
            workload_manager.add_workload(workload)
        except Exception:
            log.error(
                "Failed to add currently running workload: '{}', maybe it exited."
                .format(workload.get_id()))

    log.info("Isolated currently running workloads.")
    # Start processing events after adding running workloads to avoid processing a die event before we add a workload
    event_manager.start_processing_events()
    _notify_ready()
Example #27
    def get_free_threads(
            self,
            cpu: Cpu,
            workload_map: Dict[str, Workload],
            cpu_usage: Dict[str, float] = None) -> List[Thread]:

        if cpu_usage is None:
            log.error("CPU usage is required, defaulting to EMPTY threads being free.")
            return cpu.get_empty_threads()

        free_threads = []
        for c in get_free_cores(self.__threshold, cpu, workload_map, cpu_usage):
            free_threads += c.get_threads()

        return free_threads
Example #28
def get_allocator(allocator_str, config_manager):
    if allocator_str not in CPU_ALLOCATORS:
        log.error(
            "Unexpected CPU allocator specified: '{}', falling back to default: '{}'"
            .format(allocator_str, DEFAULT_ALLOCATOR))
        allocator_str = DEFAULT_ALLOCATOR

    free_thread_provider = get_free_thread_provider(config_manager)
    if allocator_str != FORECAST_CPU_IP:
        return CPU_ALLOCATOR_NAME_TO_CLASS_MAP[allocator_str](
            free_thread_provider)

    return ForecastIPCpuAllocator(
        cpu_usage_predictor_manager=get_cpu_usage_predictor_manager(),
        config_manager=config_manager,
        free_thread_provider=free_thread_provider)
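A minimal usage sketch (the unrecognized allocator name is illustrative; unknown names fall back to DEFAULT_ALLOCATOR):

# Hypothetical caller: an unrecognized name falls back to the default allocator.
allocator = get_allocator("no_such_allocator", get_config_manager())
log.info("Using allocator: %s", allocator.__class__.__name__)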
Example #29
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except Exception:
        log.error("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)
Example #30
def download_latest_cpu_model(path=get_cpu_model_file_path()):
    log.info("Downloading latest cpu prediction model.")
    latest_model = get_latest_cpu_model()
    if latest_model is None:
        log.error("Failed to download model because no model found.")
        return

    bucket_name = get_cpu_model_bucket_name()
    key = latest_model['Key']
    s3_client = boto3.client('s3')
    log.info(
        "Downloading latest cpu prediction model: '{}/{}' to: '{}'".format(
            bucket_name, key, path))
    s3_client.download_file(bucket_name, key, path)
    log.info("Downloaded latest cpu prediction model: '{}/{}' to: '{}'".format(
        bucket_name, key, path))
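Note that the default path=get_cpu_model_file_path() is evaluated once, at function definition time, not per call. If per-call resolution is desired, the usual late-binding pattern applies (a sketch, not the source's code):

# Late-binding variant (sketch): resolve the path on each call.
def download_latest_cpu_model_late_bound(path=None):
    if path is None:
        path = get_cpu_model_file_path()
    download_latest_cpu_model(path)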