def translate_structure_doc_to_timeseries(doc):
    """Map a structure document to a list of OpenTSDB timeseries points.

    Only metrics listed in PERSIST_METRICS are emitted; metrics with a null
    value are logged and skipped.
    """
    try:
        struct_name = doc["name"]
        timestamp = int(time.time())
        timeseries_list = []
        for resource, resource_metrics in doc["resources"].items():
            for doc_metric, value in resource_metrics.items():
                if doc_metric not in PERSIST_METRICS:
                    continue
                # Zero is a valid value, only null-like values are rejected
                if value or value == 0:
                    metric = ".".join([doc["type"], resource, doc_metric])
                    timeseries_list.append(
                        dict(metric=metric,
                             value=value,
                             timestamp=timestamp,
                             tags={"structure": struct_name}))
                else:
                    log_error(
                        "Error with document: {0}, doc metric {1} has null value '{2}', assuming a value of '{3}'"
                        .format(str(doc), doc_metric, value,
                                OPENTSDB_STORED_VALUES_AS_NULL), debug)
        return timeseries_list
    except (ValueError, KeyError) as e:
        log_error(
            "Error {0} {1} with document: {2} ".format(
                str(e), str(traceback.format_exc()), str(doc)), debug)
        raise
def persist_containers(container_resources_dict):
    """Persist the current resource values of every container, one thread each."""
    # Remote database operation; if containers cannot be fetched there is
    # nothing to do
    containers = get_structures(db_handler, debug, subtype="container")
    if not containers:
        return

    # Retrieve each container resources, persist them and store them to generate host info
    threads = []
    for container in containers:
        # Skip containers whose document was never properly initialized,
        # otherwise it might be overwritten with just the "current" value
        # without possibility of correcting it
        badly_configured = False
        for resource in resources_persisted:
            if resource not in container["resources"] or "max" not in container["resources"][resource]:
                log_error(
                    "Container {0} has not a proper config for the resource {1}"
                    .format(container["name"], resource), debug)
                badly_configured = True
        if badly_configured:
            continue

        worker = Thread(target=thread_persist_container,
                        args=(container, container_resources_dict,))
        worker.start()
        threads.append(worker)

    for worker in threads:
        worker.join()
def main():
    """Entry point: run the Guardian loop, logging any fatal error."""
    try:
        Guardian().guard()
    except Exception as e:
        message = "{0} {1}".format(str(e), str(traceback.format_exc()))
        log_error(message, debug=True)
def main():
    """Entry point: run the ReFeeder loop, logging any fatal error."""
    try:
        ReFeeder().refeed()
    except Exception as e:
        message = "{0} {1}".format(str(e), str(traceback.format_exc()))
        log_error(message, debug=True)
def adjust_container_state(self, resources, limits, resources_to_adjust):
    """Correct the limits of every resource in resources_to_adjust.

    For each resource the chain current > upper > lower is enforced, keeping
    'boundary' between current and upper and between upper and lower.

    Returns the corrected limits dict.
    Raises RuntimeError if a required field ('boundary', 'current') is missing
    or if the limits are still invalid after 10 correction attempts.
    """
    for resource in resources_to_adjust:
        if "boundary" not in limits[resource]:
            raise RuntimeError(
                "Missing boundary value for resource {0}".format(resource))
        if "current" not in resources[resource]:
            raise RuntimeError(
                "Missing current value for resource {0}".format(resource))

        n_loop, errors = 0, True
        while errors:
            n_loop += 1
            try:
                self.check_invalid_container_state(resources, limits, resource)
                errors = False
            except ValueError:
                # Correct the chain current > upper > lower, including
                # boundary between current and upper
                boundary = int(limits[resource]["boundary"])
                limits[resource]["upper"] = int(
                    resources[resource]["current"] - boundary)
                limits[resource]["lower"] = int(
                    limits[resource]["upper"] - boundary)
            except RuntimeError as e:
                log_error(str(e), self.debug)
                raise
            # BUGFIX: only give up while the state is STILL invalid; the
            # original raised even when the 10th attempt had just succeeded
            if errors and n_loop >= 10:
                message = "Limits for {0} can't be adjusted, check the configuration (max:{1},current:{2}, boundary:{3}, min:{4})".format(
                    resource, resources[resource]["max"],
                    int(resources[resource]["current"]),
                    limits[resource]["boundary"], resources[resource]["min"])
                raise RuntimeError(message)
                # TODO This prevents from checking other resources
    return limits
def update_container_current_values(container_name, resources):
    """Store the current resource values of one container in the database."""
    # Remote database operation
    database_structure = db_handler.get_structure(container_name)
    structure = database_structure.copy()
    structure.setdefault("resources", dict())

    for resource in resources_persisted:
        structure["resources"].setdefault(resource, dict())
        if resource not in resources or not resources[resource]:
            log_error(
                "Unable to get info for resource {0} for container {1}".format(
                    resource, container_name), debug)
            structure["resources"][resource]["current"] = 0
        else:
            label = translate_map[resource]["limit_label"]
            structure["resources"][resource]["current"] = int(
                resources[resource][label])

    # Remote database operation
    update_structure(structure, db_handler, debug)
def send_data(docs):
    """Post the documents to OpenTSDB and return how many were accepted."""
    if not docs:
        return 0
    # Remote database operation
    success, info = opentsdb_handler.send_json_documents(docs)
    if success:
        return len(docs)
    log_error(
        "Couldn't properly post documents, error : {0}".format(
            json.dumps(info["error"])), debug)
    return 0
def rebalance_containers(self, config):
    """Rebalance CPU among the containers of every rebalanceable application."""
    self.__config = config
    self.__debug = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "DEBUG")

    log_info("_______________", self.__debug)
    log_info("Performing CONTAINER CPU Balancing", self.__debug)

    # Get the containers and applications
    try:
        applications = get_structures(self.__couchdb_handler, self.__debug, subtype="application")
        containers = get_structures(self.__couchdb_handler, self.__debug, subtype="container")
    except requests.exceptions.HTTPError as e:
        log_error("Couldn't get applications", self.__debug)
        log_error(str(e), self.__debug)
        return

    # Keep only the apps that accept rebalancing, have more than one
    # container and actually need an internal rebalance
    # TODO Improve this management
    rebalanceable_apps = []
    for app in applications:
        if "rebalance" in app and app["rebalance"] != True:
            continue
        if len(app["containers"]) <= 1:
            continue
        if self.__app_containers_can_be_rebalanced(app):
            rebalanceable_apps.append(app)

    # Group the containers by the application they belong to
    app_containers = dict()
    for app in rebalanceable_apps:
        app_name = app["name"]
        members = [c for c in containers if c["name"] in app["containers"]]
        # Get the container usages
        app_containers[app_name] = self.__fill_containers_with_usage_info(members)

    # Rebalance applications
    for app in rebalanceable_apps:
        app_name = app["name"]
        log_info("Going to rebalance {0} now".format(app_name), self.__debug)
        self.__rebalance_containers_by_pair_swapping(app_containers[app_name], app_name)

    log_info("_______________", self.__debug)
def thread_persist_container(container, container_resources_dict):
    """Worker: persist the current resource values of a single container."""
    container_name = container["name"]
    # The container resources were retrieved beforehand (remote operation);
    # if unavailable, skip this container and let the others proceed
    resources = container_resources_dict[container_name]["resources"]
    if not resources:
        log_error(
            "Couldn't get container's {0} resources".format(container_name),
            debug)
        return
    # Persist by updating the Database current value
    update_container_current_values(container_name, resources)
def persist_applications(container_resources_dict):
    """Aggregate container resource values into their applications and persist them."""
    # Try to get the applications, if unavailable, return
    applications = get_structures(db_handler, debug, subtype="application")
    if not applications:
        return

    # Generate the applications current resource values
    for app in applications:
        # Reset the counter of every persisted resource the app knows about
        for resource in resources_persisted:
            if resource not in app["resources"]:
                log_error(
                    "Application {0} is missing info of resource {1}".format(
                        app["name"], resource), debug)
            else:
                app["resources"][resource]["current"] = 0

        for container_name in app["containers"]:
            if container_name not in container_resources_dict:
                log_error(
                    "Container info {0} is missing for app : {1}, app info will not be totally accurate"
                    .format(container_name, app["name"]), debug)
                continue

            # Add up this container's contribution to the app totals
            for resource in resources_persisted:
                try:
                    container_resources = container_resources_dict[container_name]["resources"]
                    if resource not in container_resources or not container_resources[resource]:
                        log_error(
                            "Unable to get info for resource {0} for container {1} when computing app {2} resources"
                            .format(resource, container_name, app["name"]),
                            debug)
                    else:
                        label = translate_map[resource]["limit_label"]
                        app["resources"][resource]["current"] += container_resources[resource][label]
                except KeyError:
                    if "name" in container_resources_dict[container_name] and "name" in app:
                        log_error(
                            "Container info {0} is missing for app: {1} and resource {2} resource,"
                            .format(container_name, app["name"], resource) +
                            " app info will not be totally accurate", debug)

        # Remote database operation
        update_structure(app, db_handler, debug)
def get_container_usages(self, container_name):
    """Retrieve the usage timeseries of a container from OpenTSDB.

    Logs a warning for every generated metric that came back without data;
    re-raises connection errors after logging them.
    """
    try:
        container_info = self.opentsdb_handler.get_structure_timeseries(
            {"host": container_name}, self.window_difference,
            self.window_delay, BDWATCHDOG_METRICS,
            REFEEDER_APPLICATION_METRICS)

        for metric in REFEEDER_APPLICATION_METRICS:
            # Only generated metrics are checked for missing data
            if metric in CONFIG_DEFAULT_VALUES["GENERATED_METRICS"] \
                    and container_info[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE:
                log_warning(
                    "No metric info for {0} in container {1}".format(
                        metric, container_name), debug=True)
    except requests.ConnectionError as e:
        log_error("Connection error: {0} {1}".format(
            str(e), str(traceback.format_exc())), debug=True)
        raise e
    return container_info
def __get_container_usages(self, container):
    """Fetch the usage metrics of a container; return None if unavailable."""
    window_difference = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_TIMELAPSE")
    window_delay = get_config_value(self.__config, CONFIG_DEFAULT_VALUES, "WINDOW_DELAY")

    try:
        # Remote database operation
        usages = self.__opentsdb_handler.get_structure_timeseries(
            {"host": container["name"]}, window_difference, window_delay,
            BDWATCHDOG_CONTAINER_METRICS, GUARDIAN_CONTAINER_METRICS)

        # Skip this structure if all the usage metrics are unavailable
        if all(usages[m] == self.__NO_METRIC_DATA_DEFAULT_VALUE for m in usages):
            log_warning("container: {0} has no usage data".format(container["name"]), self.__debug)
            return None
        return usages
    except Exception as e:
        log_error("error with structure: {0} {1} {2}".format(container["name"], str(e), str(traceback.format_exc())), self.__debug)
        return None
def refeed_user_used_energy(self, applications, users, db_handler, debug):
    """Aggregate the energy/cpu usage of every user's applications and update the user documents.

    For each user, sums the 'usage' of energy and cpu plus the 'current' cpu
    across the user's applications, then persists the totals via db_handler.
    """
    for user in users:
        if "cpu" not in user:
            user["cpu"] = {}
        if "energy" not in user:
            user["energy"] = {}
        total_user = {"cpu": 0, "energy": 0}
        total_user_current_cpu = 0
        user_apps = get_user_apps(applications, user)
        for app in user_apps:
            for resource in ["energy", "cpu"]:
                if "usage" in app["resources"][resource] and app["resources"][resource]["usage"]:
                    total_user[resource] += app["resources"][resource]["usage"]
                else:
                    log_error(
                        "Application {0} of user {1} has no used {2} field or value"
                        .format(app["name"], user["name"], resource), debug)

            # BUGFIX: the original tested the 'usage' value and read through
            # the stale loop variable 'resource'; test and read the cpu
            # 'current' field explicitly, as the error message below intends
            if "current" in app["resources"]["cpu"] and app["resources"]["cpu"]["current"]:
                total_user_current_cpu += app["resources"]["cpu"]["current"]
            else:
                log_error(
                    "Application {0} of user {1} has no current cpu field or value"
                    .format(app["name"], user["name"]), debug)

        user["energy"]["used"] = total_user["energy"]
        user["cpu"]["usage"] = total_user["cpu"]
        user["cpu"]["current"] = total_user_current_cpu
        db_handler.update_user(user)
        log_info(
            "Updated energy consumed by user {0}".format(user["name"]), debug)
def main():
    """Entry point: run the snapshoter loop, logging any fatal error."""
    try:
        persist()
    except Exception as e:
        message = "{0} {1}".format(str(e), str(traceback.format_exc()))
        log_error(message, debug=True)
def persist():
    """Database snapshoter service loop: periodically persist documents."""
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    global debug
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info (remote database operation)
        service = get_service(db_handler, SERVICE_NAME)
        # Heartbeat (remote database operation)
        beat(db_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        documents_persisted = myConfig.get_value("DOCUMENTS_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Documents to be persisted are -> {0}".format(documents_persisted),
            debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO THis code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 4:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        if SERVICE_IS_ACTIVATED:
            for docType in documents_persisted:
                persist_docs(docType)
        else:
            log_warning(
                "Database snapshoter is not activated, will not do anything",
                debug)

        t1 = time.time()
        log_info("Epoch processed in {0} seconds ".format("%.2f" % (t1 - t0)), debug)
        log_info("----------------------\n", debug)
        time.sleep(polling_frequency)
def guard(self, ):
    """Guardian service loop: periodically guard the configured structures.

    Each epoch: refresh config from the service document, heartbeat, fetch
    the guarded structures and process them in a background thread.
    """
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)

    while True:
        # Get service info
        service = get_service(self.couchdb_handler, SERVICE_NAME)

        # Heartbeat
        beat(self.couchdb_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        self.debug = myConfig.get_value("DEBUG")
        debug = self.debug
        self.guardable_resources = myConfig.get_value("GUARDABLE_RESOURCES")
        self.cpu_shares_per_watt = myConfig.get_value("CPU_SHARES_PER_WATT")
        self.window_difference = myConfig.get_value("WINDOW_TIMELAPSE")
        self.window_delay = myConfig.get_value("WINDOW_DELAY")
        self.structure_guarded = myConfig.get_value("STRUCTURE_GUARDED")
        self.event_timeout = myConfig.get_value("EVENT_TIMEOUT")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        t0 = start_epoch(self.debug)

        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Time window lapse -> {0}".format(self.window_difference), debug)
        log_info("Delay -> {0}".format(self.window_delay), debug)
        log_info("Event timeout -> {0}".format(self.event_timeout), debug)
        log_info(
            "Resources guarded are -> {0}".format(self.guardable_resources),
            debug)
        log_info(
            "Structure type guarded is -> {0}".format(self.structure_guarded),
            debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        invalid, message = self.invalid_conf()
        if invalid:
            log_error(message, debug)
            if self.window_difference < 5:
                log_error(
                    "Window difference is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]),
                    self.debug)
                self.window_difference = CONFIG_DEFAULT_VALUES["WINDOW_TIMELAPSE"]
            time.sleep(self.window_difference)
            end_epoch(self.debug, self.window_difference, t0)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            # Remote database operation
            structures = get_structures(self.couchdb_handler, debug,
                                        subtype=self.structure_guarded)
            if structures:
                log_info(
                    "{0} Structures to process, launching threads".format(
                        len(structures)), debug)
                thread = Thread(name="guard_structures",
                                target=self.guard_structures,
                                args=(structures, ))
                thread.start()
            else:
                log_info("No structures to process", debug)
        else:
            log_warning("Guardian is not activated", debug)

        time.sleep(self.window_difference)

        wait_operation_thread(thread, debug)

        # BUGFIX: the original passed t0 as the first argument here, but
        # end_epoch takes the debug flag first (see the call in the
        # invalid-config branch above)
        end_epoch(self.debug, self.window_difference, t0)
def serverless(self, structure, rules): structure_subtype = structure["subtype"] # Check if structure is guarded if "guard" not in structure or not structure["guard"]: log_warning( "structure: {0} is set to leave alone, skipping".format( structure["name"]), self.debug) return # Check if the structure has any resource set to guarded struct_guarded_resources = list() for res in self.guardable_resources: if res in structure["resources"] and "guard" in structure[ "resources"][res] and structure["resources"][res]["guard"]: struct_guarded_resources.append(res) if not struct_guarded_resources: log_warning( "Structure {0} is set to guarded but has no resource marked to guard" .format(structure["name"]), self.debug) return # Check if structure is being monitored, otherwise, ignore if structure_subtype not in BDWATCHDOG_METRICS or structure_subtype not in GUARDIAN_METRICS or structure_subtype not in TAGS: log_error( "Unknown structure subtype '{0}'".format(structure_subtype), self.debug) return try: metrics_to_retrieve = list() metrics_to_generate = dict() for res in struct_guarded_resources: metrics_to_retrieve += BDWATCHDOG_METRICS[structure_subtype][ res] metrics_to_generate[generate_structure_usage_metric( res)] = GUARDIAN_METRICS[structure_subtype][ generate_structure_usage_metric(res)] tag = TAGS[structure_subtype] # Remote database operation usages = self.opentsdb_handler.get_structure_timeseries( {tag: structure["name"]}, self.window_difference, self.window_delay, metrics_to_retrieve, metrics_to_generate) for metric in usages: if usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE: log_warning( "structure: {0} has no usage data for {1}".format( structure["name"], metric), self.debug) # Skip this structure if all the usage metrics are unavailable if all([ usages[metric] == self.NO_METRIC_DATA_DEFAULT_VALUE for metric in usages ]): log_warning( "structure: {0} has no usage data for any metric, skipping" .format(structure["name"]), self.debug) return resources = 
structure["resources"] # Remote database operation limits = self.couchdb_handler.get_limits(structure) limits_resources = limits["resources"] if not limits_resources: log_warning( "structure: {0} has no limits".format(structure["name"]), self.debug) return # Adjust the structure limits according to the current value limits["resources"] = self.adjust_container_state( resources, limits_resources, self.guardable_resources) # Remote database operation self.couchdb_handler.update_limit(limits) self.process_serverless_structure(structure, usages, limits_resources, rules) except Exception as e: log_error( "Error with structure {0}: {1}".format(structure["name"], str(e)), self.debug)
def persist():
    """Structure snapshoter service loop: periodically persist resource values."""
    logging.basicConfig(filename=SERVICE_NAME + '.log', level=logging.INFO)
    global resources_persisted
    global debug
    myConfig = MyConfig(CONFIG_DEFAULT_VALUES)

    while True:
        log_info("----------------------", debug)
        log_info("Starting Epoch", debug)
        t0 = time.time()

        # Get service info (remote database operation)
        service = get_service(db_handler, SERVICE_NAME)
        # Heartbeat (remote database operation)
        beat(db_handler, SERVICE_NAME)

        # CONFIG
        myConfig.set_config(service["config"])
        polling_frequency = myConfig.get_value("POLLING_FREQUENCY")
        debug = myConfig.get_value("DEBUG")
        resources_persisted = myConfig.get_value("RESOURCES_PERSISTED")
        SERVICE_IS_ACTIVATED = myConfig.get_value("ACTIVE")

        log_info(
            "Going to snapshot resources: {0}".format(resources_persisted),
            debug)
        log_info("Config is as follows:", debug)
        log_info(".............................................", debug)
        log_info("Polling frequency -> {0}".format(polling_frequency), debug)
        log_info(
            "Resources to be snapshoter are -> {0}".format(
                resources_persisted), debug)
        log_info(".............................................", debug)

        ## CHECK INVALID CONFIG ##
        # TODO This code is duplicated on the structures and database snapshoters
        invalid, message = invalid_conf(myConfig)
        if invalid:
            log_error(message, debug)
            time.sleep(polling_frequency)
            if polling_frequency < 3:
                log_error(
                    "Polling frequency is too short, replacing with DEFAULT value '{0}'"
                    .format(CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]), debug)
                polling_frequency = CONFIG_DEFAULT_VALUES["POLLING_FREQUENCY"]
            log_info("----------------------\n", debug)
            time.sleep(polling_frequency)
            continue

        thread = None
        if SERVICE_IS_ACTIVATED:
            thread = Thread(target=persist_thread, args=())
            thread.start()
        else:
            log_warning(
                "Structure snapshoter is not activated, will not do anything",
                debug)

        time.sleep(polling_frequency)

        wait_operation_thread(thread, debug)

        t1 = time.time()
        time_proc = "%.2f" % (t1 - t0 - polling_frequency)
        time_total = "%.2f" % (t1 - t0)
        log_info(
            "Epoch processed in {0} seconds ({1} processing and {2} sleeping)".
            format(time_total, time_proc, str(polling_frequency)), debug)
        log_info("----------------------\n", debug)