Example #1
    def __update_workload(self, func, arg, workload_id):
        try:
            with self.__lock:
                log.debug("Acquired lock for func: {} on workload: {}".format(
                    func.__name__, workload_id))
                start_time = time.time()
                func(arg)
                stop_time = time.time()
                if self.__reg is not None:
                    self.__reg.distribution_summary(
                        self.__get_workload_processing_metric_name(
                            func.__name__),
                        self.__tags).record(stop_time - start_time)
                    self.__reg.distribution_summary(
                        WORKLOAD_PROCESSING_DURATION,
                        self.__tags).record(stop_time - start_time)

            log.debug("Released lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            return True
        except:
            self.__error_count += 1
            log.exception("Failed to execute func: {} on workload: {}".format(
                func.__name__, workload_id))
            return False
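Examples #1 and #17 wrap the same idea: serialize the update with a lock, time the wrapped call, and record the elapsed seconds. A stripped-down sketch of that wrapper; the duration sink is a stand-in, not the metrics registry used above.

import threading
import time

class TimedUpdater:
    # Times a callable while holding a lock and hands the duration to a sink callable.
    def __init__(self, record_duration):
        self.__lock = threading.Lock()
        self.__record_duration = record_duration
        self.__error_count = 0

    def update(self, func, arg):
        try:
            with self.__lock:
                start_time = time.time()
                func(arg)
                self.__record_duration(time.time() - start_time)
            return True
        except Exception:
            self.__error_count += 1
            return False

# usage: prints the argument, then the elapsed time is appended to the list
durations = []
TimedUpdater(durations.append).update(print, "workload-1")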
Example #2
    def __process_events(self):
        while not self.__stopped:
            try:
                event = self.__q.get(timeout=self.__event_timeout)
                dequeue_time = time.time()
                log.info("Dequeued event: {}, queue depth: {}".format(event[ACTION], self.get_queue_depth()))
                if self.__reg is not None:
                    self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                    self.__reg.counter(self.__get_dequeued_metric_name(event), self.__tags).increment()
                    self.__reg.distribution_summary(QUEUE_LATENCY_KEY, self.__tags).record(dequeue_time - event[ENQUEUE_TIME_KEY])
            except Empty:
                log.debug("Timed out waiting for event on queue.")
                continue

            for event_handler in self.__event_handlers:
                try:
                    log.info("{} handling event: {}".format(type(event_handler).__name__, event[ACTION]))
                    event_handler.handle(event)
                    self.__report_succeeded_event(event_handler)
                except:
                    log.exception("Event handler: '{}' failed to handle event: '{}'".format(
                        type(event_handler).__name__, event))
                    self.__report_failed_event(event_handler)

            self.__q.task_done()
            self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
            self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
            self.__processed_count += 1
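The two event-processing loops in this collection (Example #2 and Example #29) share one shape: block on the queue with a timeout so the stop flag is re-checked, isolate each handler failure, and mark the item done. A minimal standard-library sketch of that shape; the handlers, stop flag, and bookkeeping are stand-ins, not the titus-isolate types.

import time
from queue import Queue, Empty
from threading import Thread

def process_events(q: Queue, handlers, should_stop, timeout=1.0):
    processed = 0
    while not should_stop():
        try:
            event = q.get(timeout=timeout)  # short timeout so the stop flag is re-checked
        except Empty:
            continue
        for handler in handlers:
            try:
                handler(event)              # one failing handler does not block the others
            except Exception:
                pass                        # the originals log.exception() and count the failure here
        q.task_done()
        processed += 1
    return processed

# usage
q = Queue()
q.put({"action": "create"})
stopped = False
worker = Thread(target=process_events, args=(q, [print], lambda: stopped))
worker.start()
time.sleep(0.1)
stopped = True
worker.join()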
Example #3
def get_cell_name():
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    url = config_manager.get_str(REMOTE_ALLOCATOR_URL)
    if url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    timeout = config_manager.get_int(MAX_SOLVER_RUNTIME,
                                     DEFAULT_MAX_SOLVER_RUNTIME)

    try:
        response = requests.get(url, timeout=timeout)
        cell_name = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
        if cell_name is None:
            log.warning("Titus isolation cell header is not set.")
            return UNKNOWN_CELL
        else:
            return cell_name
    except:
        log.exception("Failed to determine isolation cell.")
        return UNKNOWN_CELL
Example #4
 def __sample_monitors(self):
     with self.__lock:
         for workload_id, monitor in self.__monitors.items():
             try:
                 Thread(target=monitor.sample).start()
             except:
                 log.exception("Failed to sample performance of workload: '{}'".format(workload_id))
Example #5
 def get_name(self) -> str:
     url = "{}/cpu_allocator".format(self.__url)
     try:
         response = requests.get(url, timeout=self.__timeout)
         return "Remote({})".format(response.text)
     except:
         log.exception("Failed to GET cpu allocator name.")
         return UNKNOWN_CPU_ALLOCATOR
Example #6
    def __report_metrics(self):
        try:
            tags = self.get_tags()

            for reporter in self.__reporters:
                reporter.report_metrics(tags)
        except Exception:
            log.exception("Failed to report metrics.")
Example #7
 def report_event(self, payload: dict):
     try:
         payload['ts'] = str(datetime.datetime.utcnow())
         event = {"uuid": str(uuid.uuid4()), "payload": payload}
         msg = get_event_msg(event)
         self.__q.put_nowait(msg)
     except:
         self.__failed_msg_count += 1
         log.exception(
             "Failed to report event for payload: {}".format(payload))
Example #8
 def __get_cpuset(self, container_name: str) -> str:
     try:
         self.__wait_for_files(container_name)
         log.info("getting cpuset.cpus for workload: '{}'".format(
             container_name))
         return get_cpuset(container_name)
     except:
         log.exception("Failed to read cpuset for workload: '{}'".format(
             container_name))
         return None
Example #9
 def rebalance(self, request: AllocateRequest) -> AllocateResponse:
     try:
         self.__primary_rebalance_call_count += 1
         return self.__primary_allocator.rebalance(request)
     except:
         log.exception(
             "Failed to rebalance workloads: '{}' with primary allocator: '{}', falling back to: '{}'".format(
                 [w.get_id() for w in request.get_workloads().values()],
                 self.__primary_allocator.__class__.__name__,
                 self.__secondary_allocator.__class__.__name__))
         self.__secondary_rebalance_call_count += 1
         return self.__secondary_allocator.rebalance(request)
Example #10
 def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
     try:
         self.__primary_free_threads_call_count += 1
         return self.__primary_allocator.free_threads(request)
     except:
         log.exception(
             "Failed to free threads for workload: '{}' with primary allocator: '{}', falling back to: '{}'".format(
                 request.get_workload_id(),
                 self.__primary_allocator.__class__.__name__,
                 self.__secondary_allocator.__class__.__name__))
         self.__secondary_free_threads_call_count += 1
         return self.__secondary_allocator.free_threads(request)
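Examples #9 and #10 are two instances of the same shape: count the call, try the primary allocator, and on any exception log it, count the fallback, and delegate to the secondary. A generic sketch of that fallback pattern; the class and the allocator callables below are illustrative, not the titus-isolate API.

import logging

log = logging.getLogger(__name__)

class FallbackAllocator:
    # Tries the primary callable; on any failure, logs and falls back to the secondary.
    def __init__(self, primary, secondary):
        self.__primary = primary
        self.__secondary = secondary
        self.__fallback_count = 0

    def allocate(self, request):
        try:
            return self.__primary(request)
        except Exception:
            log.exception("Primary allocator failed, falling back.")
            self.__fallback_count += 1
            return self.__secondary(request)

# usage: the secondary only runs when the primary raises
def flaky_primary(request):
    raise RuntimeError("solver timed out")

allocator = FallbackAllocator(flaky_primary, lambda request: {"threads": []})
print(allocator.allocate({"workload_id": "w1"}))  # {'threads': []}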
Example #11
    def __update_burst_cpusets(self):
        empty_thread_ids = self.__get_empty_thread_ids()
        error_count = 0
        for b_w in get_burst_workloads(self.__workloads.values()):
            log.info("updating burst workload: '{}'".format(b_w.get_id()))
            try:
                self.__cgroup_manager.set_cpuset(b_w.get_id(), empty_thread_ids)
            except:
                error_count += 1
                log.exception("Failed to update burst workload: '{}', maybe it's gone.".format(b_w.get_id()))

        self.__error_count += error_count
Example #12
def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds",
                      sleep_time)
            time.sleep(sleep_time)
        except:
            log.exception("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)
Example #13
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        try:
            if is_kubernetes():
                workloads.append(get_workload_from_kubernetes(container.name))
            else:
                workloads.append(get_workload_from_disk(container.name))
        except:
            log.exception("Failed to read environment for container: '%s'",
                          container.name)

    return workloads
Example #14
    def report_metrics(self, tags):
        log.debug("Reporting metrics")
        try:
            # Workload manager metrics
            self.__reg.gauge(RUNNING, tags).set(1)

            self.__reg.gauge(ADDED_KEY, tags).set(
                self.__workload_manager.get_added_count())
            self.__reg.gauge(REMOVED_KEY, tags).set(
                self.__workload_manager.get_removed_count())
            self.__reg.gauge(SUCCEEDED_KEY, tags).set(
                self.__workload_manager.get_success_count())
            self.__reg.gauge(FAILED_KEY, tags).set(
                self.__workload_manager.get_error_count())
            self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(
                len(self.__workload_manager.get_workloads()))

            # Allocator metrics
            self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(
                self.__workload_manager.get_allocator_call_duration_sum_secs())
            self.__reg.gauge(FALLBACK_ALLOCATOR_COUNT, tags).set(
                self.__workload_manager.get_fallback_allocator_calls_count())
            self.__reg.gauge(IP_ALLOCATOR_TIMEBOUND_COUNT, tags).set(
                self.__workload_manager.
                get_time_bound_ip_allocator_solution_count())

            # Event manager metrics
            self.__reg.gauge(QUEUE_DEPTH_KEY,
                             tags).set(self.__event_manager.get_queue_depth())
            self.__reg.gauge(EVENT_SUCCEEDED_KEY, tags).set(
                self.__event_manager.get_success_count())
            self.__reg.gauge(EVENT_FAILED_KEY,
                             tags).set(self.__event_manager.get_error_count())
            self.__reg.gauge(EVENT_PROCESSED_KEY, tags).set(
                self.__event_manager.get_processed_count())

            # CPU metrics
            cross_package_violation_count = len(
                get_cross_package_violations(
                    self.__workload_manager.get_cpu()))
            shared_core_violation_count = len(
                get_shared_core_violations(self.__workload_manager.get_cpu()))
            self.__reg.gauge(PACKAGE_VIOLATIONS_KEY,
                             tags).set(cross_package_violation_count)
            self.__reg.gauge(CORE_VIOLATIONS_KEY,
                             tags).set(shared_core_violation_count)
            log.debug("Reported metrics")

        except:
            log.exception("Failed to report metric")
Example #15
def init():
    # Initialize currently running containers as workloads
    log.info("Isolating currently running workloads...")
    for workload in get_current_workloads(docker.from_env()):
        try:
            workload_manager.add_workload(workload)
        except:
            log.exception(
                "Failed to add currently running workload: '{}', maybe it exited."
                .format(workload.get_id()))

    log.info("Isolated currently running workloads.")
    # Start processing events after adding running workloads to avoid processing a die event before we add a workload
    event_manager.start_processing_events()
Example #16
def get_duration_predictions(input_str: str) -> List[DurationPrediction]:
    try:
        # "0.05=0.29953;0.1=0.29953;0.15=0.29953;0.2=0.29953;0.25=0.29953;0.3=0.29953;0.35=0.29953;0.4=0.29953;0.45=0.29953;0.5=0.29953;0.55=0.29953;0.6=0.29953;0.65=0.29953;0.7=0.29953;0.75=0.29953;0.8=0.29953;0.85=0.29953;0.9=0.29953;0.95=0.29953"
        duration_predictions = []
        pairs = input_str.split(';')
        for p in pairs:
            k, v = p.split('=')
            duration_predictions.append(DurationPrediction(float(k), float(v)))

        return duration_predictions
    except:
        log.exception(
            "Failed to parse duration predictions: '{}'".format(input_str))
        return []
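The comment in the snippet above documents the expected wire format: semicolon-separated percentile=value pairs. A tiny self-contained illustration of parsing that format into plain tuples (DurationPrediction itself is omitted here):

def parse_predictions(input_str: str):
    # "0.05=0.29953;0.1=0.31" -> [(0.05, 0.29953), (0.1, 0.31)]
    pairs = []
    for p in input_str.split(';'):
        k, v = p.split('=')
        pairs.append((float(k), float(v)))
    return pairs

print(parse_predictions("0.05=0.29953;0.1=0.31"))  # [(0.05, 0.29953), (0.1, 0.31)]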
Example #17
 def __update_workload(self, func, arg, workload_id):
     try:
         with self.__lock:
             log.info("Acquired lock for func: {} on workload: {}".format(func.__name__, workload_id))
             start_time = time.time()
             func(arg)
             stop_time = time.time()
             self.__allocator_call_duration_sum_secs = stop_time - start_time
         log.info("Released lock for func: {} on workload: {}".format(func.__name__, workload_id))
         return True
     except:
         self.__error_count += 1
         log.exception("Failed to execute func: {} on workload: {}".format(func.__name__, workload_id))
         return False
Example #18
    def __set_cpuset(self, container_name, thread_ids):
        thread_ids_str = self.__get_thread_ids_str(thread_ids)

        try:
            self.__wait_for_files(container_name)
            log.info("updating workload: '{}' to cpuset.cpus: '{}'".format(
                container_name, thread_ids_str))
            set_cpuset(container_name, thread_ids_str)
            self.__write_succeeded(container_name)
        except:
            self.__write_failed()
            log.exception(
                "Failed to apply cpuset to threads: '{}' for workload: '{}'".
                format(thread_ids_str, container_name))
Example #19
    def __snapshot_usage_raw(self):
        try:
            # Avoid making a metrics query on a potentially empty dataset which causes the query command to fail, which
            # causes noisy logs which look like failures.
            workload_manager = get_workload_manager()
            if workload_manager is None or len(
                    workload_manager.get_workloads()) == 0:
                log.info('No workloads so skipping pcp snapshot.')
                return

            instance_filter = "INVALID_INSTANCE_FILTER"
            if is_kubernetes():
                instance_filter = '.*titus-executor.*.service'
            else:
                instance_filter = '/containers.slice/[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}'

            # pmrep -a /var/log/pcp/pmlogger/$(hostname)/ -S -60m -t 1m -y s -o csv -i .*titus-executor.*.service  cgroup.cpuacct.usage cgroup.memory.usage
            snapshot_cmd_fmt = """ pmrep -a {0} \
                    -S -{1}s \
                    -T -0s \
                    -t {2}s \
                    -y s \
                    -o csv \
                    -i {3} \
                    cgroup.cpuacct.usage \
                    cgroup.memory.usage \
                    titus.network.in.bytes \
                    titus.network.out.bytes \
                    titus.disk.bytes_used """

            cmd_str = snapshot_cmd_fmt.format(get_pcp_archive_path(),
                                              self.__relative_start_sec,
                                              self.__interval_sec,
                                              instance_filter)

            log.info('Snapshoting usage from pcp: {}'.format(' '.join(
                cmd_str.split())))

            byte_array = subprocess.check_output(
                cmd_str, shell=True, timeout=self.__query_timeout_sec)
            raw_csv_snapshot = byte_array.decode('utf-8')
            usages = get_resource_usage(raw_csv_snapshot,
                                        self.__interval_count,
                                        self.__interval_sec)

            with self.__lock:
                self.__usages = usages
        except:
            log.exception("Failed to snapshot pcp data or compute usages")
Example #20
    def __groom(self, event):
        if not event[ACTION] == CREATE:
            self.__groomed_q.put_nowait(event)
            return

        try:
            name = get_container_name(event)
            log.info("Grooming create event for: '{}'".format(name))

            self.__file_manger.wait_for_files(name)
            log.info("Groomed create event for: '{}'".format(name))
            self.__groomed_q.put_nowait(event)
        except:
            self.__error_event_count += 1
            log.exception("Dropping CREATE event, because failed to wait for files for: '{}'".format(name))
Example #21
    def __watch(self):
        while True:
            try:
                instance_id = get_config_manager().get_str("EC2_INSTANCE_ID")
                field_selector = "spec.nodeName={}".format(instance_id)
                log.info("Watching pods with field selector: %s",
                         field_selector)

                v1 = client.CoreV1Api()
                w = watch.Watch()

                for event in w.stream(v1.list_pod_for_all_namespaces,
                                      field_selector=field_selector):
                    self.__handle_event(event)
            except:
                log.exception("pod watch thread failed")
Example #22
def free_threads():
    try:
        body = request.get_json()
        log.info("Processing free threads request: {}".format(body))
        threads_request = get_threads_request(body)
        response = get_free_cpu_allocator().free_threads(threads_request)

        global free_threads_success_count
        free_threads_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to free threads")
        global free_threads_failure_count
        free_threads_failure_count += 1
        return "Failed to free threads", 500
Example #23
def rebalance():
    try:
        body = request.get_json()
        log.info("Processing rebalance threads request: {}".format(body))
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
Example #24
    def __process_events(self):
        while True:
            try:
                msg = self.__q.get()
                log.debug("Sending event log message: {}".format(msg))
                response = send_event_msg(msg, self.__address)

                if response.status_code != 200:
                    log.error("Re-enqueuing failed event log message: {}".format(response.content))
                    self.__retry_msg_count += 1
                    self.__q.put_nowait(msg)
                else:
                    self.__succeeded_msg_count += 1
            except:
                self.__failed_msg_count += 1
                log.exception("Failed to process event log message.")
Example #25
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except:
        log.exception("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)
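Examples #12 and #25 drive the schedule library: run any pending jobs, then compute how long to sleep from schedule.idle_seconds(), falling back to a fixed interval when nothing is scheduled or the idle time comes back negative. A minimal self-contained loop using those same schedule calls; the job and the bounded three-iteration loop are illustrative.

import time
import schedule

SCHEDULING_SLEEP_INTERVAL = 1.0

def heartbeat():
    print("tick")

schedule.every(2).seconds.do(heartbeat)

def schedule_once() -> float:
    schedule.run_pending()
    sleep_time = SCHEDULING_SLEEP_INTERVAL
    if schedule.next_run() is not None:
        sleep_time = schedule.idle_seconds()
    if sleep_time < 0:
        sleep_time = SCHEDULING_SLEEP_INTERVAL
    return sleep_time

# bounded variant of the loop in Example #12
for _ in range(3):
    time.sleep(schedule_once())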
Example #26
def update_numa_balancing(workload: Workload, cpu: Cpu):
    try:
        config_manager = get_config_manager()
        dynamic_numa_balancing_enabled = config_manager.get_bool(
            TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING,
            DEFAULT_TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING)

        if not dynamic_numa_balancing_enabled:
            enable_numa_balancing()
            return

        if _occupies_entire_cpu(workload, cpu):
            disable_numa_balancing()
        else:
            enable_numa_balancing()
    except:
        log.exception("Failed to update NUMA balancing.")
Example #27
def rebalance():
    try:
        request_ip = request.headers.get(FORWARDED_FOR_HEADER)
        log.info("Processing rebalance threads request (from, proxy): {}".format(request_ip))

        body = request.get_json()
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        log.info("Processed rebalance threads request (from, proxy): {}".format(request_ip))
        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
Example #28
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                labels = container.labels
                cpu = int(__get_value(labels, CPU_LABEL_KEY, -1))
                mem = int(__get_value(labels, MEM_LABEL_KEY, -1))
                disk = int(__get_value(labels, DISK_LABEL_KEY, -1))
                network = int(__get_value(labels, NETWORK_LABEL_KEY, -1))
                app_name = __get_value(labels, APP_NAME_LABEL_KEY)
                owner_email = __get_value(labels, OWNER_EMAIL_LABEL_KEY)
                command = __get_value(labels, COMMAND_LABEL_KEY)
                entrypoint = __get_value(labels, ENTRYPOINT_LABEL_KEY)
                job_type = __get_value(labels, JOB_TYPE_LABEL_KEY)
                workload_type = __get_value(labels, WORKLOAD_TYPE_LABEL_KEY)
                image = __get_image(container)

                workloads.append(
                    Workload(identifier=workload_id,
                             thread_count=cpu,
                             mem=mem,
                             disk=disk,
                             network=network,
                             app_name=app_name,
                             owner_email=owner_email,
                             image=image,
                             command=command,
                             entrypoint=entrypoint,
                             job_type=job_type,
                             workload_type=workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected labels.".format(
                    workload_id))

    return workloads
Example #29
    def __process_events(self):
        while not self.__stopped:
            try:
                event = self.__groomed_q.get(timeout=self.__event_timeout)
            except Empty:
                log.debug("Timed out waiting for event on queue.")
                continue

            for event_handler in self.__event_handlers:
                try:
                    event_handler.handle(event)
                    self.__success_event_count += 1
                except:
                    log.exception("Event handler: '{}' failed to handle event: '{}'".format(
                        type(event_handler).__name__, event))
                    self.__error_event_count += 1

            self.__raw_q.task_done()
            self.__processed_event_count += 1
            log.debug("processed event count: {}".format(self.get_success_count()))
Example #30
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                cpu = int(container.labels[CPU_LABEL_KEY])
                workload_type = container.labels[WORKLOAD_TYPE_LABEL_KEY]
                workloads.append(Workload(workload_id, cpu, workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(
                        container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".
                format(workload_id, CPU_LABEL_KEY))

    return workloads
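Examples #13, #28, and #30 all build workloads by walking docker_client.containers.list() and reading container labels. A minimal docker SDK sketch of that traversal; the label key below is a placeholder, not one of the titus-isolate label constants.

import docker

def list_labeled_containers(cpu_label="com.example/cpu"):
    client = docker.from_env()
    found = []
    for container in client.containers.list():
        labels = container.labels or {}
        if cpu_label in labels:
            found.append((container.name, int(labels[cpu_label])))
    return found

# usage (requires a running Docker daemon)
# for name, cpu in list_labeled_containers():
#     print(name, cpu)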