Example #1
def __wait_for_files(container_name):
    cgroup_file_wait_timeout = get_config_manager().get_float(
        WAIT_CGROUP_FILE_KEY, DEFAULT_WAIT_CGROUP_FILE_SEC)
    json_file_wait_timeout = get_config_manager().get_float(
        WAIT_JSON_FILE_KEY, DEFAULT_WAIT_JSON_FILE_SEC)
    wait_for_files(container_name, cgroup_file_wait_timeout,
                   json_file_wait_timeout)
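
All of the examples on this page follow the same pattern: fetch the process-wide config manager, then read a key through a typed getter (get_str, get_int, get_float, ...) with a default that is returned when the key is unset. A minimal, dict-backed sketch of that pattern (illustrative only; the real titus-isolate ConfigManager is backed by a dynamic property provider, and the key name below is made up):

# Minimal sketch of the typed-getter pattern used throughout these examples.
# Illustrative only; not the real titus-isolate ConfigManager.
class SketchConfigManager:
    def __init__(self, properties: dict):
        self.__properties = properties

    def get_str(self, key, default=None):
        value = self.__properties.get(key)
        return default if value is None else str(value)

    def get_float(self, key, default=None):
        value = self.__properties.get(key)
        return default if value is None else float(value)


cm = SketchConfigManager({"titus.isolate.wait.cgroup.sec": "5"})
assert cm.get_float("titus.isolate.wait.cgroup.sec", 1.0) == 5.0
assert cm.get_float("missing.key", 1.0) == 1.0
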
Example #2
def get_workload_from_kubernetes(identifier) -> Optional[KubernetesWorkload]:
    if not managers_are_initialized():
        log.error(
            "Cannot get workload from kubernetes because managers aren't initialized"
        )
        return None

    retry_count = get_config_manager().get_int(
        GET_WORKLOAD_RETRY_COUNT, DEFAULT_GET_WORKLOAD_RETRY_COUNT)
    retry_interval = get_config_manager().get_float(
        GET_WORKLOAD_RETRY_INTERVAL_SEC,
        DEFAULT_GET_WORKLOAD_RETRY_INTERVAL_SEC)

    pod_manager = get_pod_manager()
    for i in range(retry_count):
        log.info("Getting pod from kubernetes: %s", identifier)
        pod = pod_manager.get_pod(identifier)
        if pod is not None:
            log.info("Got pod from kubernetes: %s", identifier)
            return KubernetesWorkload(pod)

        log.info("Retrying getting pod from kubernetes in %s seconds",
                 retry_interval)
        time.sleep(retry_interval)

    log.error("Failed to get pod from kubernetes: %s", identifier)
    return None
Example #3
def get_cpu_model_prefix_name():
    config_manager = get_config_manager()
    prefix = config_manager.get_str(MODEL_BUCKET_PREFIX,
                                    DEFAULT_MODEL_BUCKET_PREFIX)
    leaf = config_manager.get_str(MODEL_BUCKET_LEAF, DEFAULT_MODEL_BUCKET_LEAF)

    format_str = get_config_manager().get_str(MODEL_PREFIX_FORMAT_STR)
    if format_str is None:
        return None

    return format_str.format(prefix, leaf)
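
Since the format string is applied with positional arguments, its value only needs two '{}' placeholders; a worked example with a hypothetical MODEL_PREFIX_FORMAT_STR value (the real value is environment-specific configuration):

# Hypothetical format string; the real value is environment-specific config.
format_str = "{}/{}"
assert format_str.format("cpu-models", "latest") == "cpu-models/latest"
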
Example #4
    def get_cpu_predictions(self, workloads: List[Workload], resource_usage: GlobalResourceUsage) \
            -> Optional[Dict[str, float]]:

        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager is not yet set")
            return {}

        cpu_usage = resource_usage.get_cpu_usage()
        if cpu_usage is None:
            log.warning("No cpu usage")
            return {}
        pred_env = PredEnvironment(config_manager.get_region(),
                                   config_manager.get_environment(),
                                   datetime.utcnow().hour)

        predictions = {}
        for workload in workloads:
            workload_cpu_usage = cpu_usage.get(workload.get_id(), None)
            if workload_cpu_usage is None:
                log.warning("No CPU usage for workload: %s", workload.get_id())
                continue

            workload_cpu_usage = [float(u) for u in workload_cpu_usage]
            pred_cpus = self.predict(workload, workload_cpu_usage, pred_env)
            predictions[workload.get_id()] = pred_cpus

        return predictions
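
The shape get_cpu_usage() returns can be inferred from the loop above: a mapping from workload id to a sequence of (possibly string-typed) usage samples. An illustrative sketch of that assumption:

# Illustrative shape inferred from the loop above; the real
# GlobalResourceUsage type is defined elsewhere in titus-isolate.
cpu_usage = {
    "workload-a": ["0.5", "0.75", "1.0"],
}
samples = [float(u) for u in cpu_usage["workload-a"]]
assert samples == [0.5, 0.75, 1.0]
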
Example #5
def get_workload_response(workload: Workload,
                          cpu: Cpu) -> Optional[WorkloadAllocateResponse]:
    thread_ids = get_threads(cpu, workload.get_id())
    cpu_shares = get_cpu_shares(workload)
    cpu_quota = get_cpu_quota(workload)

    if len(thread_ids) < 1:
        return None

    memory_migrate = DEFAULT_TITUS_ISOLATE_MEMORY_MIGRATE
    memory_spread_page = DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_PAGE
    memory_spread_slab = DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_SLAB

    config_manager = get_config_manager()
    if config_manager is not None:
        memory_migrate = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_MIGRATE, DEFAULT_TITUS_ISOLATE_MEMORY_MIGRATE)
        memory_spread_page = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_SPREAD_PAGE,
            DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_PAGE)
        memory_spread_slab = config_manager.get_cached_bool(
            TITUS_ISOLATE_MEMORY_SPREAD_SLAB,
            DEFAULT_TITUS_ISOLATE_MEMORY_SPREAD_SLAB)

    return WorkloadAllocateResponse(workload_id=workload.get_id(),
                                    thread_ids=thread_ids,
                                    cpu_shares=cpu_shares,
                                    cpu_quota=cpu_quota,
                                    memory_migrate=memory_migrate,
                                    memory_spread_page=memory_spread_page,
                                    memory_spread_slab=memory_spread_slab)
Example #6
def get_required_property(key):
    value = get_config_manager().get_str(key)
    if value is None:
        log.error("Failed to retrieve property: '{}'".format(key))
        return None

    return value
Example #7
    def __init__(self, primary_cpu_allocator: CpuAllocator, secondary_cpu_allocator: CpuAllocator):
        if primary_cpu_allocator is None:
            raise ValueError("Must be provided a primary cpu allocator.")

        if secondary_cpu_allocator is None:
            raise ValueError("Must be provided a secondary cpu allocator.")

        self.__reg = None

        self.__primary_allocator = primary_cpu_allocator
        self.__secondary_allocator = secondary_cpu_allocator

        self.__primary_assign_threads_call_count = 0
        self.__primary_free_threads_call_count = 0
        self.__primary_rebalance_call_count = 0

        self.__secondary_assign_threads_call_count = 0
        self.__secondary_free_threads_call_count = 0
        self.__secondary_rebalance_call_count = 0

        self.__queue_depth_fallback_count = 0

        cm = get_config_manager()
        self.__fallback_queue_depth = cm.get_cached_int(FALLBACK_QUEUE_DEPTH, DEFAULT_FALLBACK_QUEUE_DEPTH)

        log.info(
            "Created FallbackCpuAllocator with primary cpu allocator: '{}' and secondary cpu allocator: '{}', fallback queue depth: '{}'".format(
                self.__primary_allocator.__class__.__name__,
                self.__secondary_allocator.__class__.__name__,
                self.__fallback_queue_depth))
Example #8
    def __init__(self,
                 event_iterable,
                 event_handlers,
                 event_timeout=DEFAULT_EVENT_TIMEOUT_SECS):
        self.__reg = None
        self.__stopped = False
        self.__q = Queue()

        self.__events = event_iterable
        self.__event_handlers = event_handlers
        self.__event_timeout = event_timeout

        self.__success_event_count = 0
        self.__error_event_count = 0
        self.__processed_event_count = 0

        self.__started = False
        self.__started_lock = Lock()

        self.__processing_thread = Thread(target=self.__process_events)
        self.__pulling_thread = Thread(target=self.__pull_events)

        config_manager = get_config_manager()

        rebalance_frequency = config_manager.get_float(
            REBALANCE_FREQUENCY_KEY, DEFAULT_REBALANCE_FREQUENCY)
        if rebalance_frequency > 0:
            schedule.every(rebalance_frequency).seconds.do(self.__rebalance)

        reconcile_frequency = config_manager.get_float(
            RECONCILE_FREQUENCY_KEY, DEFAULT_RECONCILE_FREQUENCY)
        if reconcile_frequency > 0:
            schedule.every(reconcile_frequency).seconds.do(self.__reconcile)
Example #9
def get_cpu_shares(workload: Workload) -> int:
    if workload.is_opportunistic():
        opportunistic_shares_scale = get_config_manager().get_int(
            OPPORTUNISTIC_SHARES_SCALE_KEY, DEFAULT_OPPORTUNISTIC_SHARES_SCALE)
        return workload.get_thread_count() * opportunistic_shares_scale

    return workload.get_thread_count() * DEFAULT_SHARES_SCALE
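
A worked example of the shares arithmetic, with assumed constants (the real DEFAULT_SHARES_SCALE and opportunistic scale values come from titus-isolate's configuration, not from this sketch):

# Assumed constants for illustration; real values come from configuration.
DEFAULT_SHARES_SCALE = 100
opportunistic_shares_scale = 1

thread_count = 4
assert thread_count * DEFAULT_SHARES_SCALE == 400        # regular workload
assert thread_count * opportunistic_shares_scale == 4    # opportunistic workload
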
Example #10
def get_prom_url() -> str:
    cm = get_config_manager()

    # e.g. titusprometheus.us-east-1.staging01cell001.test.netflix.net
    default_host = f'titusprometheus.{cm.get_region()}.{cm.get_stack()}.{cm.get_environment()}.netflix.net'
    host = cm.get_cached_str(PROMETHEUS_HOST_OVERRIDE, default_host)
    return f'http://{host}/api/v1/query_range'
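
The returned URL targets Prometheus' standard HTTP API, so a caller can issue a range query against it with the usual query/start/end/step parameters; a minimal sketch (the PromQL query is illustrative):

import time

import requests

url = get_prom_url()
now = int(time.time())
params = {
    "query": "node_cpu_seconds_total",  # illustrative PromQL query
    "start": now - 3600,                # last hour
    "end": now,
    "step": "60s",
}
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()
series = response.json()["data"]["result"]
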
Example #11
    def get_predictions(
        self, running_pods: List[V1Pod], resource_usage: GlobalResourceUsage
    ) -> Optional[ResourceUsagePredictions]:
        config_manager = get_config_manager()
        if config_manager is None:
            log.warning("Config manager not yet set.")
            return None

        client_crt = get_client_cert_path(config_manager)
        client_key = get_client_key_path(config_manager)
        if client_crt is None or client_key is None:
            log.error("Failed to generate credential paths")
            return None

        url = get_url(config_manager)
        if url is None:
            log.error("Unable to generate prediction service url")
            return None

        body = self.__get_body(running_pods, resource_usage)
        if body is None:
            log.error("Unable to generate a prediction request body")
            return None

        predictions = get_predictions(client_crt, client_key, url, body)
        if predictions is None:
            log.error("Failed to get predictions")
            return None

        return ResourceUsagePredictions(predictions)
Example #12
    def __get_request_metadata(self, request_type) -> dict:
        config_manager = get_config_manager()
        return {
            "type": request_type,
            "instance_id": self.__instance_id,
            "region": config_manager.get_region(),
            "environment": config_manager.get_environment()
        }
Example #13
    def __update_local_model(self):
        cpu_predictor = get_config_manager().get_str(CPU_PREDICTOR, DEFAULT_CPU_PREDICTOR)
        if cpu_predictor == LEGACY_CPU_PREDICTOR:
            download_latest_cpu_model()
            with self.__lock:
                self.__cpu_usage_predictor = CpuUsagePredictor(get_cpu_model_file_path())
        else:
            log.info("Skipping model update.  CPU predictor: %s", cpu_predictor)
Example #14
    def __init__(self, free_thread_provider=EmptyFreeThreadProvider()):

        self.__reg = None
        self.__cache = {}
        self.__time_bound_call_count = 0

        self.__solver_max_runtime_secs = get_config_manager().get_float(
            MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
        self.__free_thread_provider = free_thread_provider
Example #15
    def __init__(self, free_thread_provider):
        config_manager = get_config_manager()

        self.__url = config_manager.get_str(REMOTE_ALLOCATOR_URL, "http://localhost:7501")
        solver_max_runtime_secs = config_manager.get_float(MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
        solver_max_connect_secs = config_manager.get_float(MAX_SOLVER_CONNECT_SEC, DEFAULT_MAX_SOLVER_CONNECT_SEC)
        self.__timeout = (solver_max_connect_secs, solver_max_runtime_secs)
        self.__headers = {'Content-Type': "application/json"}
        self.__reg = None
Example #16
    def __init__(self, exit_handler: ExitHandler):
        self.__exit_handler = exit_handler
        self.__config_manager = get_config_manager()
        self.__registry = None
        self.__oppo = None

        self.__custom_api = kubernetes.client.CustomObjectsApi(
            kubernetes.config.new_client_from_config(
                config_file=DEFAULT_KUBECONFIG_PATH))
Example #17
    def __set_address(self):
        config_manager = get_config_manager()
        region = config_manager.get_region()
        env = config_manager.get_environment()
        format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
        stream = 'titus_isolate'

        self.__address = format_str.format(region, env, stream)
        log.info("Set keystone address to: {}".format(self.__address))
Example #18
    def __init__(self):
        self.__config_manager = get_config_manager()
        self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)

        kubeconfig = self.get_kubeconfig_path()
        self.__core_api = kubernetes.client.CoreV1Api(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))
        # NOTE[jigish]:  This API depends on the OpportunisticResource CRD. See the readme for how to create it.
        self.__custom_api = kubernetes.client.CustomObjectsApi(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))
Example #19
def get_cpu_model_bucket_name():
    format_str = get_required_property(MODEL_BUCKET_FORMAT_STR)
    if format_str is None:
        return None

    config_manager = get_config_manager()
    region = config_manager.get_region()
    env = config_manager.get_environment()

    return format_str.format(region, env)
Example #20
    def __get_tags():
        ec2_instance_id = 'EC2_INSTANCE_ID'

        tags = {}
        if ec2_instance_id in os.environ:
            tags["node"] = os.environ[ec2_instance_id]

        allocator_name = get_allocator_class(get_config_manager()).__name__
        tags["cpu_allocator"] = allocator_name

        return tags
Example #21
    def __get_address(self) -> Optional[str]:
        config_manager = get_config_manager()
        region = config_manager.get_region()
        env = config_manager.get_environment()
        format_str = config_manager.get_str(EVENT_LOG_FORMAT_STR)
        if format_str is None:
            log.warning("Keystone is not enabled in this region env: %s %s", region, env)
            return None

        stream = 'titus_isolate'
        return format_str.format(region, env, stream)
Example #22
    def get_cpu_predictor(self) -> Optional[SimpleCpuPredictor]:
        config_manager = get_config_manager()
        cpu_predictor = config_manager.get_str(CPU_PREDICTOR, DEFAULT_CPU_PREDICTOR)
        log.info("Using cpu predictor: %s", cpu_predictor)

        if cpu_predictor == SERVICE_CPU_PREDICTOR:
            return self.__resource_usage_predictor

        if cpu_predictor == LEGACY_CPU_PREDICTOR:
            with self.__lock:
                return self.__cpu_usage_predictor

        return None
Example #23
    def __watch(self):
        while True:
            try:
                instance_id = get_config_manager().get_str("EC2_INSTANCE_ID")
                field_selector = "spec.nodeName={}".format(instance_id)
                log.info("Watching pods with field selector: %s",
                         field_selector)

                v1 = client.CoreV1Api()
                w = watch.Watch()

                for event in w.stream(v1.list_pod_for_all_namespaces,
                                      field_selector=field_selector):
                    self.__handle_event(event)
            except Exception:
                log.exception("pod watch thread failed")
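
Each item yielded by w.stream is a dict carrying the event type and the pod object; a hypothetical handler (the real __handle_event implementation is not shown on this page) might look like:

# Hypothetical handler; the real __handle_event is not shown on this page.
def handle_event(event):
    event_type = event["type"]    # "ADDED", "MODIFIED" or "DELETED"
    pod = event["object"]         # a V1Pod
    log.info("pod event: %s %s", event_type, pod.metadata.name)
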
Example #24
    def __init__(self, workload_manager: WorkloadManager,
                 window_publisher: OpportunisticWindowPublisher):

        super().__init__(workload_manager)
        self.__window_publisher = window_publisher

        self.__reg = None
        self.__fail_count = 0
        self.__skip_count = 0
        self.__success_count = 0
        self.__reclaimed_cpu_count = None

        self.__config_manager = get_config_manager()
        self.__workload_monitor_manager = get_workload_monitor_manager()
        self.__cpu_usage_predictor_manager = get_cpu_usage_predictor_manager()

        self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)
Example #25
def update_numa_balancing(workload: Workload, cpu: Cpu):
    try:
        config_manager = get_config_manager()
        dynamic_numa_balancing_enabled = config_manager.get_bool(
            TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING,
            DEFAULT_TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING)

        if not dynamic_numa_balancing_enabled:
            enable_numa_balancing()
            return

        if _occupies_entire_cpu(workload, cpu):
            disable_numa_balancing()
        else:
            enable_numa_balancing()
    except Exception:
        log.error("Failed to update NUMA balancing.")
Example #26
def isolate_workload(workload_id, timeout=None):
    if timeout is None:
        timeout = get_config_manager().get_float(
            TITUS_ISOLATE_BLOCK_SEC, DEFAULT_TITUS_ISOLATE_BLOCK_SEC)

    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_workload_manager().is_isolated(workload_id):
            return json.dumps({'workload_id': workload_id}), 200, {
                'ContentType': 'application/json'
            }
        time.sleep(0.1)

    log.error("Failed to isolate workload: '{}'".format(workload_id))
    return json.dumps({'unknown_workload_id': workload_id}), 404, {
        'ContentType': 'application/json'
    }
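
The (body, status, headers) tuples returned here match Flask's view-function convention, so the function can back a route directly; a minimal wiring sketch (the route path is an assumption, not the real titus-isolate API surface):

from flask import Flask

app = Flask(__name__)


# Hypothetical route; the real titus-isolate endpoint layout may differ.
@app.route("/isolate/<workload_id>")
def isolate(workload_id):
    return isolate_workload(workload_id)
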
Example #27
    def __init__(self, event_iterable, event_handlers, event_timeout=DEFAULT_EVENT_TIMEOUT_SECS):
        self.__reg = None
        self.__tags = None
        self.__stopped = False
        self.__q = Queue()

        self.__events = event_iterable
        self.__event_handlers = event_handlers
        self.__event_timeout = event_timeout

        self.__processed_count = 0

        self.__started = False
        self.__started_lock = Lock()

        self.__processing_thread = Thread(target=self.__process_events)
        self.__pulling_thread = Thread(target=self.__pull_events)
        self.last_successful_event_epoch_s = 0

        config_manager = get_config_manager()

        # Every instance of titus-isolate getting restarted at once produces scheduling spikes of events like
        # rebalance
        random_jitter = randrange(10)  # 0-9 inclusive

        rebalance_frequency = config_manager.get_float(REBALANCE_FREQUENCY_KEY, DEFAULT_REBALANCE_FREQUENCY)
        if rebalance_frequency > 0:
            schedule.every(rebalance_frequency + random_jitter).seconds.do(self.__rebalance)

        reconcile_frequency = config_manager.get_float(RECONCILE_FREQUENCY_KEY, DEFAULT_RECONCILE_FREQUENCY)
        if reconcile_frequency > 0:
            schedule.every(reconcile_frequency + random_jitter).seconds.do(self.__reconcile)

        oversubscribe_frequency = config_manager.get_float(OVERSUBSCRIBE_FREQUENCY_KEY,
                                                           DEFAULT_OVERSUBSCRIBE_FREQUENCY)
        if oversubscribe_frequency > 0:
            schedule.every(oversubscribe_frequency + random_jitter).seconds.do(self.__oversubscribe)

        predict_resource_usage_frequency = config_manager.get_float(PREDICT_RESOURCE_USAGE_FREQUENCY_KEY,
                                                                    DEFAULT_PREDICT_RESOURCE_USAGE_FREQUENCY)

        if predict_resource_usage_frequency > 0:
            schedule.every(predict_resource_usage_frequency + random_jitter).seconds.do(self.__predict_usage)
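
The schedule.every(...).seconds.do(...) registrations above only fire once something drives the scheduler; with the schedule library that is a run_pending() loop, typically on a background thread. A minimal sketch of such a driver (the driver itself is not shown on this page):

import time
from threading import Thread

import schedule


def _run_schedule_loop():
    # Execute any registered job whose interval has elapsed.
    while True:
        schedule.run_pending()
        time.sleep(1)


Thread(target=_run_schedule_loop, daemon=True).start()
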
Example #28
    def __init__(self, free_thread_provider):
        config_manager = get_config_manager()
        self.__endpoint = config_manager.get_cached_str(
            GRPC_REMOTE_ALLOC_ENDPOINT, None)
        if self.__endpoint is None:
            raise Exception("Could not get remote allocator endpoint address.")
        self.__call_timeout_secs = 1000.0 * config_manager.get_cached_int(
            GRPC_REMOTE_ALLOC_CLIENT_CALL_TIMEOUT_MS,
            GRPC_REMOTE_ALLOC_DEFAULT_CLIENT_CALL_TIMEOUT_MS)

        self.__stub = self.__create_stub()
        self.__instance_ctx = self.__pull_context()
        self.__reg = None
        self.__empty_cpu = get_cpu_from_env()
        self.__natural2original_indexing = self.__empty_cpu.get_natural_indexing_2_original_indexing()
        self.__original2natural_indexing = {
            v: k
            for k, v in self.__natural2original_indexing.items()
        }
Example #29
    def __init__(self, cpu: Cpu, cgroup_manager: CgroupManager, cpu_allocator: CpuAllocator):

        self.__reg = None
        self.__lock = Lock()
        self.__instance_id = get_config_manager().get_str(EC2_INSTANCE_ID)

        self.__cpu_allocator = cpu_allocator

        self.__error_count = 0
        self.__added_count = 0
        self.__removed_count = 0
        self.__rebalanced_count = 0
        self.__added_to_full_cpu_count = 0
        self.__allocator_call_duration_sum_secs = 0

        self.__cpu = cpu
        self.__cgroup_manager = cgroup_manager
        self.__wmm = get_workload_monitor_manager()
        self.__workloads = {}

        log.info("Created workload manager")
Example #30
    def __init__(self, exit_handler: ExitHandler):
        self.__exit_handler = exit_handler
        self.__config_manager = get_config_manager()
        self.__node_name = self.__config_manager.get_str(EC2_INSTANCE_ID)

        kubeconfig = self.get_kubeconfig_path()
        self.__core_api = kubernetes.client.CoreV1Api(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))
        self.__custom_api = kubernetes.client.CustomObjectsApi(
            kubernetes.config.new_client_from_config(config_file=kubeconfig))

        self.__lock = Lock()
        self.__opportunistic_resources = {}

        oversubscribe_frequency = self.__config_manager.get_float(
            OVERSUBSCRIBE_FREQUENCY_KEY, DEFAULT_OVERSUBSCRIBE_FREQUENCY)
        if oversubscribe_frequency > 0:
            watch_thread = Thread(target=self.__watch)
            watch_thread.start()
        else:
            log.info(
                "Skipping opportunistic resource watch, as opportunistic publishing is not configured."
            )