def translate_structure_doc_to_timeseries(doc):
    try:
        struct_name = doc["name"]
        timestamp = int(time.time())

        timeseries_list = list()
        for resource in doc["resources"]:
            resource_metrics = doc["resources"][resource]
            for doc_metric in resource_metrics:
                if doc_metric in PERSIST_METRICS:
                    value = resource_metrics[doc_metric]
                    if value or value == 0:
                        metric = ".".join([doc["type"], resource, doc_metric])
                        timeseries = dict(metric=metric,
                                          value=value,
                                          timestamp=timestamp,
                                          tags={"structure": struct_name})
                        timeseries_list.append(timeseries)
                    else:
                        log_error(
                            "Error with document: {0}, doc metric {1} has null value '{2}', assuming a value of '{3}'"
                            .format(str(doc), doc_metric, value,
                                    OPENTSDB_STORED_VALUES_AS_NULL), debug)

        return timeseries_list
    except (ValueError, KeyError) as e:
        log_error(
            "Error {0} {1} with document: {2} ".format(
                str(e), str(traceback.format_exc()), str(doc)), debug)
        raise
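# --- Hedged illustration (not part of the original module) ---
# A minimal sketch of the document shape translate_structure_doc_to_timeseries
# expects and the OpenTSDB points it emits. The metric names, the PERSIST_METRICS
# whitelist and the document below are assumptions made purely for illustration.
def _example_translate_sketch():
    import time

    PERSIST_METRICS = ["max", "current", "usage"]  # assumed whitelist
    doc = {
        "type": "structure",
        "name": "node1",
        "resources": {
            "cpu": {"max": 400, "current": 200, "usage": 120.5},
            "mem": {"max": 8192, "current": 4096, "usage": 3000},
        },
    }

    timestamp = int(time.time())
    points = []
    for resource, metrics in doc["resources"].items():
        for metric_name, value in metrics.items():
            if metric_name in PERSIST_METRICS and (value or value == 0):
                points.append(dict(metric=".".join([doc["type"], resource, metric_name]),
                                   value=value,
                                   timestamp=timestamp,
                                   tags={"structure": doc["name"]}))
    # Each point looks like:
    # {"metric": "structure.cpu.current", "value": 200, "timestamp": ..., "tags": {"structure": "node1"}}
    return points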
def persist_containers(container_resources_dict):
    # Try to get the containers, if unavailable, return
    # Remote database operation
    containers = get_structures(db_handler, debug, subtype="container")
    if not containers:
        return

    # Retrieve each container's resources, persist them and store them to generate host info
    threads = []
    for container in containers:
        # Check that the document has been properly initialized, otherwise it might be overwritten with just
        # the "current" value without possibility of correcting it
        skip = False
        for resource in resources_persisted:
            if resource not in container["resources"] or "max" not in container[
                    "resources"][resource]:
                log_error(
                    "Container {0} does not have a proper config for the resource {1}"
                    .format(container["name"], resource), debug)
                skip = True
        if skip:
            continue

        process = Thread(target=thread_persist_container,
                         args=(
                             container,
                             container_resources_dict,
                         ))
        process.start()
        threads.append(process)

    for process in threads:
        process.join()
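# --- Hedged illustration (not part of the original module) ---
# A self-contained sketch of the thread-per-container fan-out pattern used by
# persist_containers above, with a stub worker standing in for
# thread_persist_container. All names and data here are illustrative.
def _example_thread_fanout_sketch():
    from threading import Thread

    def fake_persist(container, results):
        # Stub that just records the container name instead of updating the database
        results[container["name"]] = "persisted"

    containers = [{"name": "cont0"}, {"name": "cont1"}, {"name": "cont2"}]
    results = {}

    threads = []
    for container in containers:
        t = Thread(target=fake_persist, args=(container, results))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    return results  # {'cont0': 'persisted', 'cont1': 'persisted', 'cont2': 'persisted'}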
def main():
    try:
        guardian = Guardian()
        guardian.guard()
    except Exception as e:
        log_error("{0} {1}".format(str(e), str(traceback.format_exc())),
                  debug=True)
def main():
    try:
        refeeder = ReFeeder()
        refeeder.refeed()
    except Exception as e:
        log_error("{0} {1}".format(str(e), str(traceback.format_exc())),
                  debug=True)
    def adjust_container_state(self, resources, limits, resources_to_adjust):
        for resource in resources_to_adjust:
            if "boundary" not in limits[resource]:
                raise RuntimeError(
                    "Missing boundary value for resource {0}".format(resource))
            if "current" not in resources[resource]:
                raise RuntimeError(
                    "Missing current value for resource {0}".format(resource))

            n_loop, errors = 0, True
            while errors:
                n_loop += 1
                try:
                    self.check_invalid_container_state(resources, limits,
                                                       resource)
                    errors = False
                except ValueError:
                    # Correct the chain current > upper > lower, including boundary between current and upper
                    boundary = int(limits[resource]["boundary"])
                    limits[resource]["upper"] = int(
                        resources[resource]["current"] - boundary)
                    limits[resource]["lower"] = int(limits[resource]["upper"] -
                                                    boundary)
                except RuntimeError as e:
                    log_error(str(e), self.debug)
                    raise e
                if n_loop >= 10:
                    message = "Limits for {0} can't be adjusted, check the configuration (max:{1},current:{2}, boundary:{3}, min:{4})".format(
                        resource, resources[resource]["max"],
                        int(resources[resource]["current"]),
                        limits[resource]["boundary"],
                        resources[resource]["min"])
                    raise RuntimeError(message)
                    # TODO This prevents checking other resources
        return limits
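# --- Hedged illustration (not part of the original module) ---
# A worked example of the correction applied in the ValueError branch of
# adjust_container_state: the upper limit is pulled down to current - boundary
# and the lower limit to upper - boundary, restoring the max > current > upper
# > lower > min chain. The numbers below are arbitrary.
def _example_limit_correction_sketch():
    resources = {"cpu": {"max": 400, "current": 150, "min": 50}}
    limits = {"cpu": {"boundary": 25, "upper": 200, "lower": 175}}  # upper drifted above current

    boundary = int(limits["cpu"]["boundary"])
    limits["cpu"]["upper"] = int(resources["cpu"]["current"] - boundary)  # 150 - 25 = 125
    limits["cpu"]["lower"] = int(limits["cpu"]["upper"] - boundary)       # 125 - 25 = 100

    # Now max (400) > current (150) > upper (125) > lower (100) > min (50)
    return limits  # {'cpu': {'boundary': 25, 'upper': 125, 'lower': 100}}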
def update_container_current_values(container_name, resources):
    # Remote database operation
    database_structure = db_handler.get_structure(container_name)
    structure = database_structure.copy()

    if "resources" not in structure:
        structure["resources"] = dict()

    for resource in resources_persisted:

        if resource not in structure["resources"]:
            structure["resources"][resource] = dict()

        if resource not in resources or not resources[resource]:
            log_error(
                "Unable to get info for resource {0} for container {1}".format(
                    resource, container_name), debug)
            structure["resources"][resource]["current"] = 0
        else:
            structure["resources"][resource]["current"] = resources[resource][
                translate_map[resource]["limit_label"]]

        structure["resources"][resource]["current"] = int(
            structure["resources"][resource]["current"])

    # Remote database operation
    update_structure(structure, db_handler, debug)
def send_data(docs):
    num_sent_docs = 0
    if docs:
        # Remote database operation
        success, info = opentsdb_handler.send_json_documents(docs)
        if not success:
            log_error(
                "Couldn't properly post documents, error : {0}".format(
                    json.dumps(info["error"])), debug)
        else:
            num_sent_docs = len(docs)
    return num_sent_docs
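# --- Hedged illustration (not part of the original module) ---
# A usage sketch tying translated documents and send_data together. It only
# assumes what the code above already shows: send_json_documents returns a
# (success, info) pair. The stub handler and the sample document are made up.
def _example_send_data_sketch():
    import json

    class StubOpenTSDBHandler(object):
        def send_json_documents(self, docs):
            # Pretend every document was stored successfully
            return True, {"stored": len(docs)}

    handler = StubOpenTSDBHandler()
    docs = [{"metric": "structure.cpu.current", "value": 200,
             "timestamp": 1700000000, "tags": {"structure": "node1"}}]

    success, info = handler.send_json_documents(docs)
    if not success:
        print("Couldn't properly post documents, error : {0}".format(json.dumps(info.get("error"))))
        return 0
    return len(docs)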
    def rebalance_containers(self, config):
        self.__config = config
        self.__debug = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "DEBUG")

        log_info("_______________", self.__debug)
        log_info("Performing CONTAINER CPU Balancing", self.__debug)

        # Get the containers and applications
        try:
            applications = get_structures(self.__couchdb_handler, self.__debug, subtype="application")
            containers = get_structures(self.__couchdb_handler, self.__debug, subtype="container")
        except requests.exceptions.HTTPError as e:
            log_error("Couldn't get applications or containers", self.__debug)
            log_error(str(e), self.__debug)
            return

        # Filter out the ones that do not accept rebalancing or that do not need any internal rebalancing
        rebalanceable_apps = list()
        for app in applications:
            # TODO Improve this management
            # Skip applications explicitly marked as not rebalanceable
            if "rebalance" in app and app["rebalance"] != True:
                continue
            if len(app["containers"]) <= 1:
                continue

            if self.__app_containers_can_be_rebalanced(app):
                rebalanceable_apps.append(app)

        # Group the containers according to the application they belong to
        app_containers = dict()
        for app in rebalanceable_apps:
            app_name = app["name"]
            app_containers[app_name] = list()
            app_containers_names = app["containers"]
            for container in containers:
                if container["name"] in app_containers_names:
                    app_containers[app_name].append(container)
            # Get the container usages
            app_containers[app_name] = self.__fill_containers_with_usage_info(app_containers[app_name])

        # Rebalance applications
        for app in rebalanceable_apps:
            app_name = app["name"]
            log_info("Going to rebalance {0} now".format(app_name), self.__debug)
            self.__rebalance_containers_by_pair_swapping(app_containers[app_name], app_name)

        log_info("_______________", self.__debug)
def thread_persist_container(container, container_resources_dict):
    container_name = container["name"]

    # Try to get the container resources, if unavailable, continue with others
    # Remote operation
    # resources = MyUtils.get_container_resources(container, rescaler_http_session, debug)
    resources = container_resources_dict[container_name]["resources"]
    if not resources:
        log_error(
            "Couldn't get container's {0} resources".format(container_name),
            debug)
        return

    # Persist by updating the Database current value
    update_container_current_values(container_name, resources)
def persist_applications(container_resources_dict):
    # Try to get the applications, if unavailable, return
    applications = get_structures(db_handler, debug, subtype="application")
    if not applications:
        return

    # Generate the applications current resource values
    for app in applications:
        for resource in resources_persisted:
            if resource not in app["resources"]:
                log_error(
                    "Application {0} is missing info of resource {1}".format(
                        app["name"], resource), debug)
            else:
                app["resources"][resource]["current"] = 0

        application_containers = app["containers"]
        for container_name in application_containers:

            if container_name not in container_resources_dict:
                log_error(
                    "Container info {0} is missing for app {1}, app info will not be totally accurate"
                    .format(container_name, app["name"]), debug)
                continue

            for resource in resources_persisted:
                try:
                    container_resources = container_resources_dict[
                        container_name]["resources"]
                    if resource not in container_resources or not container_resources[
                            resource]:
                        log_error(
                            "Unable to get info for resource {0} for container {1} when computing app {2} resources"
                            .format(resource, container_name,
                                    app["name"]), debug)
                    else:
                        current_resource_label = translate_map[resource][
                            "limit_label"]
                        app["resources"][resource][
                            "current"] += container_resources[resource][
                                current_resource_label]
                except KeyError:
                    if "name" in container_resources_dict[
                            container_name] and "name" in app:
                        log_error(
                            "Container info {0} is missing for app {1} and resource {2},"
                            .format(container_name, app["name"], resource) +
                            " app info will not be totally accurate", debug)

        # Remote database operation
        update_structure(app, db_handler, debug)
    def get_container_usages(self, container_name):
        try:
            container_info = self.opentsdb_handler.get_structure_timeseries(
                {"host": container_name}, self.window_difference,
                self.window_delay, BDWATCHDOG_METRICS,
                REFEEDER_APPLICATION_METRICS)

            for metric in REFEEDER_APPLICATION_METRICS:
                if metric not in CONFIG_DEFAULT_VALUES["GENERATED_METRICS"]:
                    continue
                if container_info[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE:
                    log_warning(
                        "No metric info for {0} in container {1}".format(
                            metric, container_name),
                        debug=True)

        except requests.ConnectionError as e:
            log_error("Connection error: {0} {1}".format(
                str(e), str(traceback.format_exc())),
                      debug=True)
            raise e
        return container_info
    def __get_container_usages(self, container):
        window_difference = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_TIMELAPSE")
        window_delay = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_DELAY")

        try:
            # Remote database operation
            usages = self.__opentsdb_handler.get_structure_timeseries({"host": container["name"]},
                                                                      window_difference,
                                                                      window_delay,
                                                                      BDWATCHDOG_CONTAINER_METRICS,
                                                                      GUARDIAN_CONTAINER_METRICS)

            # Skip this structure if all the usage metrics are unavailable
            if all([usages[metric] == self.__NO_METRIC_DATA_DEFAULT_VALUE for metric in usages]):
                log_warning("container: {0} has no usage data".format(container["name"]), self.__debug)
                return None

            return usages
        except Exception as e:
            log_error("error with structure: {0} {1} {2}".format(container["name"], str(e), str(traceback.format_exc())),
                      self.__debug)

            return None
    def refeed_user_used_energy(self, applications, users, db_handler, debug):
        for user in users:
            if "cpu" not in user:
                user["cpu"] = {}
            if "energy" not in user:
                user["energy"] = {}
            total_user = {"cpu": 0, "energy": 0}
            total_user_current_cpu = 0
            user_apps = get_user_apps(applications, user)
            for app in user_apps:
                for resource in ["energy", "cpu"]:
                    if "usage" in app["resources"][resource] and app[
                            "resources"][resource]["usage"]:
                        total_user[resource] += app["resources"][resource][
                            "usage"]
                    else:
                        log_error(
                            "Application {0} of user {1} has no used {2} field or value"
                            .format(app["name"], user["name"],
                                    resource), debug)

                if "current" in app["resources"]["cpu"] and app["resources"][
                        "cpu"]["current"]:
                    total_user_current_cpu += app["resources"]["cpu"][
                        "current"]
                else:
                    log_error(
                        "Application {0} of user {1} has no current cpu field or value"
                        .format(app["name"], user["name"]), debug)

            user["energy"]["used"] = total_user["energy"]
            user["cpu"]["usage"] = total_user["cpu"]
            user["cpu"]["current"] = total_user_current_cpu
            db_handler.update_user(user)
            log_info(
                "Updated energy consumed by user {0}".format(user["name"]),
                debug)
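# --- Hedged illustration (not part of the original module) ---
# A condensed sketch of the aggregation performed by refeed_user_used_energy:
# a user's totals are the sum of each of its applications' cpu/energy usage,
# plus the sum of the current cpu allocations. The documents are illustrative.
def _example_user_aggregation_sketch():
    applications = [
        {"name": "app1", "resources": {"cpu": {"usage": 120, "current": 200},
                                       "energy": {"usage": 30}}},
        {"name": "app2", "resources": {"cpu": {"usage": 80, "current": 100},
                                       "energy": {"usage": 20}}},
    ]
    user = {"name": "user0", "cpu": {}, "energy": {}}

    totals = {"cpu": 0, "energy": 0}
    total_current_cpu = 0
    for app in applications:
        for resource in ("cpu", "energy"):
            totals[resource] += app["resources"][resource].get("usage", 0)
        total_current_cpu += app["resources"]["cpu"].get("current", 0)

    user["energy"]["used"] = totals["energy"]   # 50
    user["cpu"]["usage"] = totals["cpu"]        # 200
    user["cpu"]["current"] = total_current_cpu  # 300
    return user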
def main():
    try:
        persist()
    except Exception as e:
        log_error("{0} {1}".format(str(e), str(traceback.format_exc())),
                  debug=True)
def persist():
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global debug

    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler,
                              SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        documents_persisted = myConfig.get_value("DOCUMENTS_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Documents to be persisted are -> {0}".format(documents_persisted),
            debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 4:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]

            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        if SERVICE_IS_ACTIVATED:
            for docType in documents_persisted:
                persist_docs(docType)
        else:
            log_warning(
                "Database snapshoter is not activated, will not do anything",
                debug)

        t1 = time.time()
        log_info("Epoch processed in {0} seconds ".format("%.2f" % (t1 - t0)),
                 debug)
        log_info("----------------------\n", debug)

        time.sleep(polling_frequency)
    def guard(self):
        myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
        logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

        while True:
            # Get service info
            service = get_service(self.couchdb_handler, SERVICE_NAME)

            # Heartbeat
            beat(self.couchdb_handler, SERVICE_NAME)

            # CONFIG
            myConfig.set_config(service["config"])
            self.debug = myConfig.get_value("DEBUG")
            debug = self.debug
            self.guardable_resources = myConfig.get_value(
                "GUARDABLE_RESOURCES")
            self.cpu_shares_per_watt = myConfig.get_value(
                "CPU_SHARES_PER_WATT")
            self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
            self.window_delay = myConfig.get_value("WINDOW_DELAY")
            self.structure_guarded = myConfig.get_value("STRUCTURE_GUARDED")
            self.event_timeout = myConfig.get_value("EVENT_TIMEOUT")
            SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

            t0 = start_epoch(self.debug)

            log_info("Config is as follows:", debug)
            log_info(".............................................", debug)
            log_info("Time window lapse -> {0}".format(self.window_difference),
                     debug)
            log_info("Delay -> {0}".format(self.window_delay), debug)
            log_info("Event timeout -> {0}".format(self.event_timeout), debug)
            log_info(
                "Resources guarded are -> {0}".format(
                    self.guardable_resources), debug)
            log_info(
                "Structure type guarded is -> {0}".format(
                    self.structure_guarded), debug)
            log_info(".............................................", debug)

            ## CHECK INVALID CONFIG ##
            invalid, message = self.invalid_conf()
            if invalid:
                log_error(message, debug)
                if self.window_difference < 5:
                    log_error(
                        "Window difference is too short, replacing with DEFAULT value '{0}'"
                        .format(CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]),
                        self.debug)
                    self.window_difference = CONFIG_DEFAULT_VALUES[
                        "WINDOW_TIMELAPSE"]
                time.sleep(self.window_difference)
                end_epoch(self.debug, self.window_difference, t0)
                continue

            thread = None
            if SERVICE_IS_ACTIVATED:
                # Remote database operation
                structures = get_structures(self.couchdb_handler,
                                            debug,
                                            subtype=self.structure_guarded)
                if structures:
                    log_info(
                        "{0} Structures to process, launching threads".format(
                            len(structures)), debug)
                    thread = Thread(name="guard_structures",
                                    target=self.guard_structures,
                                    args=(structures, ))
                    thread.start()
                else:
                    log_info("No structures to process", debug)
            else:
                log_warning("Guardian is not activated", debug)

            time.sleep(self.window_difference)

            wait_operation_thread(thread, debug)

            end_epoch(self.debug, self.window_difference, t0)
    def serverless(self, structure, rules):
        structure_subtype = structure["subtype"]

        # Check if structure is guarded
        if "guard" not in structure or not structure["guard"]:
            log_warning(
                "structure: {0} is set to be left alone, skipping".format(
                    structure["name"]), self.debug)
            return

        # Check if the structure has any resource set to guarded
        struct_guarded_resources = list()
        for res in self.guardable_resources:
            if res in structure["resources"] and "guard" in structure[
                    "resources"][res] and structure["resources"][res]["guard"]:
                struct_guarded_resources.append(res)
        if not struct_guarded_resources:
            log_warning(
                "Structure {0} is set to be guarded but has no resource marked to guard"
                .format(structure["name"]), self.debug)
            return

        # Check if structure is being monitored, otherwise, ignore
        if structure_subtype not in BDWATCHDOG_METRICS or structure_subtype not in GUARDIAN_METRICS or structure_subtype not in TAGS:
            log_error(
                "Unknown structure subtype '{0}'".format(structure_subtype),
                self.debug)
            return

        try:
            metrics_to_retrieve = list()
            metrics_to_generate = dict()
            for res in struct_guarded_resources:
                metrics_to_retrieve += BDWATCHDOG_METRICS[structure_subtype][
                    res]
                metrics_to_generate[generate_structure_usage_metric(
                    res)] = GUARDIAN_METRICS[structure_subtype][
                        generate_structure_usage_metric(res)]
            tag = TAGS[structure_subtype]

            # Remote database operation
            usages = self.opentsdb_handler.get_structure_timeseries(
                {tag: structure["name"]}, self.window_difference,
                self.window_delay, metrics_to_retrieve, metrics_to_generate)

            for metric in usages:
                if usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE:
                    log_warning(
                        "structure: {0} has no usage data for {1}".format(
                            structure["name"], metric), self.debug)

            # Skip this structure if all the usage metrics are unavailable
            if all([
                    usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE
                    for metric in usages
            ]):
                log_warning(
                    "structure: {0} has no usage data for any metric, skipping"
                    .format(structure["name"]), self.debug)
                return

            resources = structure["resources"]

            # Remote database operation
            limits = self.couchdb_handler.get_limits(structure)
            limits_resources = limits["resources"]

            if not limits_resources:
                log_warning(
                    "structure: {0} has no limits".format(structure["name"]),
                    self.debug)
                return

            # Adjust the structure limits according to the current value
            limits["resources"] = self.adjust_container_state(
                resources, limits_resources, self.guardable_resources)

            # Remote database operation
            self.couchdb_handler.update_limit(limits)

            self.process_serverless_structure(structure, usages,
                                              limits_resources, rules)

        except Exception as e:
            log_error(
                "Error with structure {0}: {1}".format(structure["name"],
                                                       str(e)), self.debug)
def persist():
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    global resources_persisted
    global debug

    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info
        service = get_service(db_handler,
                              SERVICE_NAME)  # Remote database operation

        # Heartbeat
        beat(db_handler, SERVICE_NAME)  # Remote database operation

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        resources_persisted = myConfig.get_value("RESOURCES_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")
        log_info(
            "Going to snapshot resources: {0}".format(resources_persisted),
            debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Resources to be snapshotted are -> {0}".format(
                resources_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 3:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]

            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            thread = Thread(target=persist_thread, args=())
            thread.start()
        else:
            log_warning(
                "Structure snapshoter is not activated, will not do anything",
                debug)

        time.sleep(polling_frequency)

        wait_operation_thread(thread, debug)

        t1 = time.time()
        time_proc = "%.2f" % (t1 - t0 - polling_frequency)
        time_total = "%.2f" % (t1 - t0)
        log_info(
            "Epoch processed in {0} seconds ({1} processing and {2} sleeping)".
            format(time_total, time_proc, str(polling_frequency)), debug)
        log_info("----------------------\n", debug)