class KubejobsController(Controller):
    """Controller for KubeJobs applications.

    Validates the control parameters, then periodically asks its alarm to
    evaluate the application state, sleeping ``check_interval`` seconds
    between evaluations.
    """

    def __init__(self, application_id, parameters):
        self.validate(parameters["control_parameters"])
        self.logger = ScalingLog(
            "diff.controller.log", "controller.log", application_id)
        self.application_id = application_id
        parameters.update({"app_id": application_id})
        # Scaling parameter: seconds to sleep between checks.
        self.check_interval = \
            parameters["control_parameters"]["check_interval"]
        # The lock guards the running flag so a stop request issued from
        # another thread is observed safely.
        self.running = True
        self.running_lock = threading.RLock()
        # The alarm decides whether to scale up, scale down or do nothing.
        self.alarm = KubeJobs(parameters)

    def start_application_scaling(self):
        """Run the monitoring loop until the controller is stopped."""
        keep_going = True
        self.logger.log("Start to control resources")
        while keep_going:
            self.logger.log("Monitoring application: %s" % self.application_id)
            # Delegate the scaling decision to the alarm.
            self.alarm.check_application_state()
            # Sleep before the next evaluation round.
            time.sleep(float(self.check_interval))
            with self.running_lock:
                keep_going = self.running

    def stop_application_scaling(self):
        """Request the monitoring loop to terminate."""
        with self.running_lock:
            self.running = False

    def status(self):
        """Return the alarm's description of its last action."""
        return self.alarm.status()

    def validate(self, data):
        """Check that every required control parameter is present and typed.

        Raises ex.BadRequestException when a key is missing or its value
        has an unexpected type.
        """
        data_model = {
            "actuator": six.string_types,
            "check_interval": int,
            "metric_source": six.string_types,
            "schedule_strategy": six.string_types
        }
        for key in data_model:
            if key not in data:
                raise ex.BadRequestException(
                    "Variable \"{}\" is missing".format(key))
            if not isinstance(data[key], data_model[key]):
                raise ex.BadRequestException(
                    "\"{}\" has unexpected variable type: {}. Was expecting {}"
                    .format(key, type(data[key]), data_model[key]))
class KubejobsController(Controller):
    """Controller for KubeJobs applications.

    Reads the scaling parameters, builds the metric source and actuator
    plugins and periodically asks the alarm to check the application.
    """

    def __init__(self, application_id, parameters):
        self.logger = ScalingLog(
            "diff.controller.log", "controller.log", application_id)
        scaling_parameters = parameters["control_parameters"]
        self.application_id = application_id
        parameters.update({"app_id": application_id})
        # read scaling parameters
        self.check_interval = scaling_parameters["check_interval"]
        self.trigger_down = scaling_parameters["trigger_down"]
        self.trigger_up = scaling_parameters["trigger_up"]
        self.min_cap = scaling_parameters["min_rep"]
        self.max_cap = scaling_parameters["max_rep"]
        self.actuation_size = scaling_parameters["actuation_size"]
        # The actuator plugin name
        self.actuator_type = scaling_parameters["actuator"]
        # The metric source plugin name
        self.metric_source_type = scaling_parameters["metric_source"]
        # We use a lock here to prevent race conditions when stopping the
        # controller
        self.running = True
        self.running_lock = threading.RLock()
        # Gets a new metric source plugin using the given name
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, parameters)
        # Gets a new actuator plugin using the given name
        actuator = ActuatorBuilder().get_actuator(self.actuator_type,
                                                  parameters=parameters)
        # The alarm here is responsible for deciding whether to scale up or
        # down, or even do nothing
        self.alarm = KubeJobs(actuator, metric_source, self.trigger_down,
                              self.trigger_up, self.min_cap, self.max_cap,
                              self.actuation_size, application_id)

    def start_application_scaling(self):
        """Run the monitoring loop until stop_application_scaling is called."""
        run = True
        # FIX: the startup message used a bare Python 2 print statement;
        # route it through the scaling log like every other message in
        # this class.
        self.logger.log("Start to control resources")
        while run:
            self.logger.log("Monitoring application: %s" % self.application_id)
            # Call the alarm to check the application
            self.alarm.check_application_state()
            # Wait some time
            time.sleep(float(self.check_interval))
            with self.running_lock:
                run = self.running

    def stop_application_scaling(self):
        """Ask the monitoring loop to stop at the next iteration."""
        with self.running_lock:
            self.running = False

    def status(self):
        """Return the alarm's description of its last action."""
        return self.alarm.status()
class PIDController(Controller):
    """Controller that scales an application through a PID alarm."""

    def __init__(self, app_id, plugin_info):
        self.logger = ScalingLog("pid.controller.log", "controller.log",
                                 app_id)
        self.app_id = app_id
        self.instances = plugin_info["instances"]
        self.check_interval = plugin_info["check_interval"]
        self.trigger_down = plugin_info["trigger_down"]
        self.trigger_up = plugin_info["trigger_up"]
        self.min_cap = plugin_info["min_cap"]
        self.max_cap = plugin_info["max_cap"]
        self.metric_rounding = plugin_info["metric_rounding"]
        self.actuator_type = plugin_info["actuator"]
        self.metric_source_type = plugin_info["metric_source"]
        self.heuristic_options = plugin_info["heuristic_options"]
        # Lock guarding the running flag against concurrent stop requests.
        self.running = True
        self.running_lock = threading.RLock()
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, plugin_info)
        actuator = ActuatorBuilder().get_actuator(self.actuator_type)
        self.alarm = PIDAlarm(actuator, metric_source, self.trigger_down,
                              self.trigger_up, self.min_cap, self.max_cap,
                              self.metric_rounding, self.heuristic_options,
                              self.app_id, self.instances)

    def start_application_scaling(self):
        """Run the monitoring loop until stop_application_scaling is called."""
        run = True
        while run:
            self.logger.log("Monitoring application: %s" % (self.app_id))
            try:
                self.alarm.check_application_state()
            except MetricNotFoundException:
                self.logger.log("No metrics available")
                # FIX: corrected the "avaliable" typo in the console message.
                print("No metrics available")
            except Exception as e:
                self.logger.log(str(e))
                print("Unknown " + str(e))
            # Wait some time
            time.sleep(float(self.check_interval))
            with self.running_lock:
                run = self.running

    def stop_application_scaling(self):
        """Ask the monitoring loop to stop at the next iteration."""
        with self.running_lock:
            self.running = False

    def status(self):
        """Return the alarm's description of its last action."""
        return self.alarm.status()
class GenericController(Controller):
    """Controller that scales an application through a generic alarm."""

    def __init__(self, application_id, plugin_info):
        self.logger = ScalingLog("diff.controller.log", "controller.log",
                                 application_id)
        self.application_id = application_id
        self.instances = plugin_info["instances"]
        self.check_interval = plugin_info["check_interval"]
        self.trigger_down = plugin_info["trigger_down"]
        self.trigger_up = plugin_info["trigger_up"]
        self.min_cap = plugin_info["min_cap"]
        self.max_cap = plugin_info["max_cap"]
        self.actuation_size = plugin_info["actuation_size"]
        self.metric_rounding = plugin_info["metric_rounding"]
        self.actuator_type = plugin_info["actuator"]
        self.metric_source_type = plugin_info["metric_source"]
        # We use a lock here to prevent race conditions when stopping the
        # controller
        self.running = True
        self.running_lock = threading.RLock()
        # Gets a new metric source plugin using the given name
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, plugin_info)
        # Gets a new actuator plugin using the given name
        actuator = ActuatorBuilder().get_actuator(self.actuator_type)
        # The alarm here is responsible for deciding whether to scale up or
        # down, or even do nothing
        self.alarm = GenericAlarm(actuator, metric_source, self.trigger_down,
                                  self.trigger_up, self.min_cap, self.max_cap,
                                  self.actuation_size, self.metric_rounding,
                                  application_id, self.instances)

    def start_application_scaling(self):
        """Run the monitoring loop until stop_application_scaling is called."""
        run = True
        while run:
            self.logger.log("Monitoring application: %s" %
                            (self.application_id))
            try:
                self.alarm.check_application_state()
            except MetricNotFoundException:
                self.logger.log("No metrics available")
                # FIX: corrected the "avaliable" typo in the console message.
                print("No metrics available")
            except Exception as e:
                self.logger.log(str(e))
                print("Unknown " + str(e))
            # Wait some time
            time.sleep(float(self.check_interval))
            with self.running_lock:
                run = self.running

    def stop_application_scaling(self):
        """Ask the monitoring loop to stop at the next iteration."""
        with self.running_lock:
            self.running = False

    def status(self):
        """Return the alarm's description of its last action."""
        return self.alarm.status()
class ProportionalDerivativeController(Controller):
    """Controller that drives scaling through a proportional-derivative
    alarm, polling the application every ``check_interval`` seconds."""

    def __init__(self, application_id, plugin_info):
        self.logger = ScalingLog("proportional_derivative.controller.log",
                                 "controller.log", application_id)
        conf = plugin_info["plugin_info"]
        self.application_id = application_id
        self.instances = conf["instances"]
        self.check_interval = conf["check_interval"]
        self.trigger_down = conf["trigger_down"]
        self.trigger_up = conf["trigger_up"]
        self.min_cap = conf["min_cap"]
        self.max_cap = conf["max_cap"]
        self.metric_rounding = conf["metric_rounding"]
        self.actuator_type = conf["actuator"]
        self.metric_source_type = conf["metric_source"]
        self.heuristic_options = conf["heuristic_options"]
        # Lock protecting the running flag against concurrent stop requests.
        self.running = True
        self.running_lock = threading.RLock()
        # Instantiate the metric source plugin by name.
        metric_source = MetricSourceBuilder().get_metric_source(
            self.metric_source_type, conf)
        # Instantiate the actuator plugin by name.
        actuator = ActuatorBuilder().get_actuator(self.actuator_type, conf)
        # The alarm decides whether to scale up, scale down or do nothing.
        self.alarm = ProportionalDerivativeAlarm(
            actuator, metric_source, self.trigger_down, self.trigger_up,
            self.min_cap, self.max_cap, self.metric_rounding,
            self.heuristic_options, application_id, self.instances)

    def start_application_scaling(self):
        """Poll the alarm until the controller is told to stop."""
        while True:
            self.logger.log("Monitoring application: %s" %
                            (self.application_id))
            # Ask the alarm to evaluate the application state.
            try:
                self.alarm.check_application_state()
            except MetricNotFoundException:
                self.logger.log("No metrics available")
            except Exception as e:
                self.logger.log(str(e))
            # Sleep before the next round.
            time.sleep(float(self.check_interval))
            with self.running_lock:
                if not self.running:
                    break

    def stop_application_scaling(self):
        """Request the monitoring loop to terminate."""
        with self.running_lock:
            self.running = False

    def status(self):
        """Return the alarm's description of its last action."""
        return self.alarm.status()
class ProportionalAlarm:
    """Alarm that adjusts per-VM CPU caps proportionally to the
    application's progress error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, metric_rounding, heuristic_options,
                 application_id, instances):
        # TODO: Check parameters
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.metric_rounding = metric_rounding
        self.heuristic_options = heuristic_options
        self.application_id = application_id
        self.instances = instances
        self.logger = ScalingLog(
            "%s.proportional.alarm.log" % (application_id),
            "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)
        # Sentinel timestamp: any real measurement compares as newer.
        self.last_progress_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and tries to modify
        the amount of allocated resources if necessary.
        """
        # TODO: Check parameters
        self.logger.log("Getting progress error")
        self.last_action = "getting progress error"
        # Get the progress error value and timestamp
        progress_error_timestamp, progress_error = self._get_progress_error(
            self.application_id)
        self.logger.log("Progress error-[%s]-%f" %
                        (str(progress_error_timestamp), progress_error))
        self.last_action = "Progress error-[%s]-%f" % (
            str(progress_error_timestamp), progress_error)
        # Check if the metric is new by comparing the timestamps of the
        # current metric and most recent metric
        if self._check_measurements_are_new(progress_error_timestamp):
            self._scale_down(progress_error, self.instances)
            self._scale_up(progress_error, self.instances)
            if self.cap != -1:
                self.cap_logger.log("%.0f|%s|%s" % (
                    time.time(), str(self.application_id), str(self.cap)))
            self.last_progress_error_timestamp = progress_error_timestamp
        else:
            self.last_action += " Could not acquire more recent metrics"
            self.logger.log("Could not acquire more recent metrics")

    def _scale_down(self, progress_error, instances):
        """
        Checks if it is necessary to scale down, according to the
        progress_error. If it is, calculates the new CPU cap value and
        tries to modify the cap of the vms.
        """
        # If error is positive and its absolute value is too high, scale down
        if progress_error > 0 and progress_error >= self.trigger_down:
            self.logger.log("Scaling down")
            self.last_action = "Getting allocated resources"
            # Get current CPU cap
            cap = self.actuator.get_allocated_resources_to_cluster(instances)
            new_cap = self._decide_next_cap(
                cap, progress_error, self.heuristic_options)
            self.logger.log("Scaling from %d to %d" % (cap, new_cap))
            self.last_action = "Scaling from %d to %d" % (cap, new_cap)
            # Currently, we use the same cap for all the vms
            cap_instances = {instance: new_cap for instance in instances}
            # Set the new cap
            self.actuator.adjust_resources(cap_instances)
            self.cap = new_cap

    def _scale_up(self, progress_error, instances):
        """
        Checks if it is necessary to scale up, according to the
        progress_error. If it is, calculates the new CPU cap value and
        tries to modify the cap of the vms.
        """
        # If the error is negative and its absolute value is too high, scale up
        if progress_error < 0 and abs(progress_error) >= self.trigger_up:
            self.logger.log("Scaling up")
            self.last_action = "Getting allocated resources"
            # Get current CPU cap
            cap = self.actuator.get_allocated_resources_to_cluster(instances)
            new_cap = self._decide_next_cap(
                cap, progress_error, self.heuristic_options)
            self.logger.log("Scaling from %f to %f" % (cap, new_cap))
            self.last_action = "Scaling from %d to %d" % (cap, new_cap)
            # Currently, we use the same cap for all the vms
            cap_instances = {instance: new_cap for instance in instances}
            # Set the new cap
            self.actuator.adjust_resources(cap_instances)
            self.cap = new_cap

    def _get_progress_error(self, application_id):
        """Return the newest (timestamp, rounded value) of the error metric."""
        # FIX: the original referenced "Proportional_Alarm", a name that
        # does not exist anywhere, so every call raised NameError. Use the
        # actual class name.
        progress_error_measurement = self.metric_source.get_most_recent_value(
            ProportionalAlarm.ERROR_METRIC_NAME,
            {"application_id": application_id})
        progress_error_timestamp = progress_error_measurement[0]
        progress_error = progress_error_measurement[1]
        progress_error = round(progress_error, self.metric_rounding)
        return progress_error_timestamp, progress_error

    def _check_measurements_are_new(self, progress_error_timestamp):
        """True when the measurement is newer than the last one handled."""
        return self.last_progress_error_timestamp < progress_error_timestamp

    def _decide_next_cap(self, current_cap, progress_error,
                         heuristic_options):
        """Dispatch to the heuristic named in heuristic_options; raise on
        unknown names."""
        heuristic = heuristic_options["heuristic_name"]
        if heuristic == "error_proportional":
            return self._error_proportional(current_cap, progress_error,
                                            heuristic_options)
        elif heuristic == "error_proportional_up_down":
            return self._error_proportional_up_down(current_cap,
                                                    progress_error,
                                                    heuristic_options)
        else:
            raise Exception("Unknown heuristic")

    def _error_proportional(self, current_cap, progress_error,
                            heuristic_options):
        """
        Calculates the new cap value using a proportional algorithm, with a
        single control parameter.

        The new cap expression is:
        new cap = old cap +- proportional_factor * |progress_error|,
        clamped to [min_cap, max_cap].
        """
        proportional_factor = heuristic_options["proportional_factor"]
        actuation_size = abs(progress_error * proportional_factor)
        if progress_error < 0:
            return min(current_cap + actuation_size, self.max_cap)
        else:
            return max(current_cap - actuation_size, self.min_cap)

    def _error_proportional_up_down(self, current_cap, progress_error,
                                    heuristic_options):
        """
        Calculates the new cap value using a proportional algorithm, with
        separate factors for scaling up and scaling down.

        The new cap expression is:
        new cap = old cap +- factor * |progress_error|,
        clamped to [min_cap, max_cap].
        """
        if progress_error < 0:
            factor = heuristic_options["factor_up"]
            actuation_size = abs(progress_error * factor)
            return min(current_cap + actuation_size, self.max_cap)
        else:
            factor = heuristic_options["factor_down"]
            actuation_size = abs(progress_error * factor)
            return max(current_cap - actuation_size, self.min_cap)

    def status(self):
        """Return a description of the alarm's last action."""
        return self.last_action
class KubeJobs:
    """Alarm that scales the number of job replicas based on the
    application's progress error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, actuation_size, application_id):
        # TODO: Check parameters
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.actuation_size = actuation_size
        self.application_id = application_id
        self.logger = ScalingLog("%s.generic.alarm.log" % (application_id),
                                 "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)
        # Sentinel timestamp: any real measurement compares as newer.
        self.last_progress_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and tries to modify
        the amount of allocated resources if necessary.
        """
        # TODO: Check parameters
        try:
            self.logger.log("Getting progress error")
            self.last_action = "getting progress error"
            # Get the progress error value and timestamp
            progress_error_timestamp, progress_error = \
                self._get_progress_error(self.application_id)
            self.logger.log(
                "Progress error-[%s]-%f" % (str(progress_error_timestamp),
                                            progress_error))
            self.last_action = "Progress error-[%s]-%f" % (
                str(progress_error_timestamp), progress_error)
            # Check if the metric is new by comparing the timestamps of the
            # current metric and most recent metric
            if self._check_measurements_are_new(progress_error_timestamp):
                self._scale_down(progress_error)
                self._scale_up(progress_error)
                if self.cap != -1:
                    self.cap_logger.log(
                        "%.0f|%s|%s" % (time.time(),
                                        str(self.application_id),
                                        str(self.cap)))
                self.last_progress_error_timestamp = progress_error_timestamp
            else:
                self.last_action += " Could not acquire more recent metrics"
                self.logger.log("Could not acquire more recent metrics")
        except Exception as e:
            # TODO: Check exception type
            self.logger.log(str(e))
            # FIX: re-raise with a bare "raise" so the original traceback
            # is preserved (Python 2's "raise e" discards it).
            raise

    def _scale_down(self, progress_error):
        """
        Scale down by actuation_size replicas (never below min_cap) when
        the progress error is positive and at least trigger_down.
        """
        # If the error is positive and too high, scale down
        if progress_error > 0 and progress_error >= self.trigger_down:
            self.logger.log("Scaling down")
            self.last_action = "Getting allocated resources"
            # Get the current number of replicas
            replicas = self.actuator.get_number_of_replicas()
            new_replicas = max(replicas - self.actuation_size, self.min_cap)
            self.logger.log("Scaling from %d to %d" % (replicas,
                                                       new_replicas))
            self.last_action = "Scaling from %d to %d" % (replicas,
                                                          new_replicas)
            # Apply the new replica count
            self.actuator.adjust_resources(new_replicas)

    def _scale_up(self, progress_error):
        """
        Scale up by actuation_size replicas (never above max_cap) when the
        progress error is negative and its magnitude is at least trigger_up.
        """
        # If the error is negative and its absolute value is too high, scale up
        if progress_error < 0 and abs(progress_error) >= self.trigger_up:
            self.logger.log("Scaling up")
            self.last_action = "Getting allocated resources"
            # Get current number of replicas
            replicas = self.actuator.get_number_of_replicas()
            new_replicas = min(replicas + self.actuation_size, self.max_cap)
            self.logger.log("Scaling from %d to %d" % (replicas,
                                                       new_replicas))
            self.last_action = "Scaling from %d to %d" % (replicas,
                                                          new_replicas)
            # Apply the new replica count
            self.actuator.adjust_resources(new_replicas)

    def _get_progress_error(self, application_id):
        """Return the (timestamp, value) of the most recent progress error."""
        progress_error_measurement = \
            self.metric_source.get_most_recent_value(application_id)
        progress_error_timestamp = progress_error_measurement[0]
        progress_error = progress_error_measurement[1]
        return progress_error_timestamp, progress_error

    def _check_measurements_are_new(self, progress_error_timestamp):
        """True when the measurement is newer than the last one handled."""
        return self.last_progress_error_timestamp < progress_error_timestamp

    def status(self):
        """Return a description of the alarm's last action."""
        return self.last_action
class PIDAlarm:
    """Alarm that adjusts per-VM CPU caps with a PID control law applied
    to the application's progress error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, metric_rounding, heuristic_options,
                 application_id, instances):
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.metric_rounding = metric_rounding
        self.heuristic_options = heuristic_options
        self.application_id = application_id
        self.instances = instances
        # Accumulated error used by the integrative term.
        self.integrated_error = 0
        self.logger = ScalingLog("%s.pid.alarm.log" % (application_id),
                                 "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)
        # Empty string marks "no previous error" for the derivative term.
        self.last_error = ""
        # Sentinel timestamp: any real measurement compares as newer.
        self.last_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and tries to modify
        the amount of allocated resources if necessary.
        """
        self.last_action = "getting progress error"
        # Get the progress error value and timestamp
        error_timestamp, error = self._get_error(self.application_id)
        self.last_action = "Progress error-[%s]-%f" % (
            str(error_timestamp), error)
        print(self.last_action)
        # Check if the metric is new by comparing the timestamps
        # of the current metric and most recent metric
        if self._check_measurements_are_new(error_timestamp):
            self._scale(error, self.instances)
            if self.cap != -1:
                print("%.0f|%s|%s" % (
                    time.time(), str(self.application_id), str(self.cap)))
            self.last_error = error
            self.last_error_timestamp = error_timestamp
        else:
            self.last_action += " Could not acquire more recent metrics"
            self.logger.log("Could not acquire more recent metrics")

    def _scale(self, error, instances):
        """
        Computes the next CPU cap from the current cap and the error and
        applies it to every instance.
        """
        # FIX: this method both raises and lowers the cap, so the original
        # fixed "Scaling down" message was misleading.
        self.logger.log("Scaling")
        self.last_action = "Getting allocated resources"
        # Get current CPU cap
        cap = self.actuator.get_allocated_resources_to_cluster(instances)
        new_cap = self._decide_next_cap(cap, error, self.heuristic_options)
        self.logger.log("Scaling from %d to %d" % (cap, new_cap))
        self.last_action = "Scaling from %d to %d" % (cap, new_cap)
        # Currently, we use the same cap for all the vms
        cap_instances = {instance: new_cap for instance in instances}
        # Set the new cap
        self.actuator.adjust_resources(cap_instances)
        self.cap = new_cap

    def _get_error(self, application_id):
        """Return the newest (timestamp, rounded value) of the error metric."""
        error_measurement = self.metric_source.get_most_recent_value(
            PIDAlarm.ERROR_METRIC_NAME,
            {"application_id": application_id}
        )
        error_timestamp = error_measurement[0]
        error = error_measurement[1]
        error = round(error, self.metric_rounding)
        return error_timestamp, error

    def _check_measurements_are_new(self, error_timestamp):
        """True when the measurement is newer than the last one handled."""
        return self.last_error_timestamp < error_timestamp

    def _decide_next_cap(self, current_cap, error, heuristic_options):
        """Dispatch to the configured heuristic; raise on unknown names."""
        heuristic = heuristic_options["heuristic_name"]
        if heuristic == "error_pid":
            return self._error_pid(current_cap, error, heuristic_options)
        else:
            raise Exception("Unknown heuristic")

    def _error_pid(self, current_cap, error, heuristic_options):
        """
        Calculates the new cap value using a PID algorithm.

        The new cap expression is:
        new cap = old cap - proportional_factor * error
                  - derivative_factor * (error difference)
                  - integrative_factor * (integrated_error),
        clamped to [min_cap, max_cap].
        """
        proportional_factor = heuristic_options["proportional_factor"]
        derivative_factor = heuristic_options["derivative_factor"]
        integrative_factor = heuristic_options["integrative_factor"]
        proportional_component = -1 * error * proportional_factor
        # If it is the first call, there is no last_error and the
        # derivative component value is null
        if self.last_error == "":
            derivative_component = 0
        else:
            derivative_component = -1 * derivative_factor * \
                (error - self.last_error)
        self.integrated_error += error
        integrative_component = \
            -1 * self.integrated_error * integrative_factor
        new_cap = current_cap + proportional_component + \
            derivative_component + integrative_component
        new_cap = max(min(new_cap, self.max_cap), self.min_cap)
        return new_cap

    def status(self):
        """Return a description of the alarm's last action.

        FIX: this method was missing even though PIDController.status()
        delegates to it, which raised AttributeError at runtime.
        """
        return self.last_action
class GenericAlarm:
    """Alarm that asks a load balancer to add or remove VMs, tiered by the
    magnitude of the application's progress error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, actuation_size, metric_rounding,
                 application_id, instances):
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.metric_rounding = metric_rounding
        self.application_id = application_id
        self.instances = instances
        self.load_balancer_url = api.load_balancer_url
        self.logger = ScalingLog("%s.generic.alarm.log" % (application_id),
                                 "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)
        self.last_error = ""
        # Sentinel timestamp: any real measurement compares as newer.
        self.last_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and asks the load
        balancer to adjust the number of VMs if necessary.
        """
        try:
            self.logger.log("Getting progress error")
            self.last_action = "getting progress error"
            # Get the progress error value and timestamp
            error_timestamp, error = self._get_error(self.application_id)
            self.last_action = "Progress error-[%s]-%f" % (
                str(error_timestamp), error)
            self.logger.log(self.last_action)
            # Check if the metric is new by comparing the timestamps of the
            # current metric and most recent metric
            # FIX: removed leftover debug print ("TO NO ALARME").
            if self._check_measurements_are_new(error_timestamp):
                self._scale_down(error, self.instances)
                self._scale_up(error, self.instances)
                self.last_error = error
                self.last_error_timestamp = error_timestamp
            else:
                self.last_action += " Could not acquire more recent metrics"
                self.logger.log("Could not acquire more recent metrics")
        except Exception as e:
            # TODO: Check exception type
            # Best-effort: failures are logged and suppressed so the
            # controller loop keeps running.
            self.logger.log(str(e))
            return

    def _scale_down(self, error, instances):
        """
        When the error is positive, ask the load balancer to remove 1-3
        VMs, tiered by the error magnitude.
        """
        # If error is positive, scale down
        if error > 0:
            # FIX: removed leftover debug print ("TO NO SCALE DOWN").
            self.logger.log("Scaling down")
            self.last_action = "Getting allocated resources"
            if error < 0.1:
                requests.post(self.load_balancer_url + "/down",
                              json={"vm_number": 1})
            elif error < 0.2:
                requests.post(self.load_balancer_url + "/down",
                              json={"vm_number": 2})
            else:
                requests.post(self.load_balancer_url + "/down",
                              json={"vm_number": 3})

    def _scale_up(self, error, instances):
        """
        When the error is negative, ask the load balancer to add 1-3 VMs,
        tiered by the error magnitude.
        """
        # If error is negative, scale up
        if error < 0:
            # FIX: removed leftover debug print ("TO NO SCALE UP").
            self.logger.log("Scaling up")
            self.last_action = "Getting allocated resources"
            if error > -0.1:
                requests.post(self.load_balancer_url + "/up",
                              json={"vm_number": 1})
            elif error > -0.2:
                requests.post(self.load_balancer_url + "/up",
                              json={"vm_number": 2})
            else:
                requests.post(self.load_balancer_url + "/up",
                              json={"vm_number": 3})

    def _get_error(self, application_id):
        """Return the newest (timestamp, rounded value) of the error metric."""
        error_measurement = self.metric_source.get_most_recent_value(
            GenericAlarm.ERROR_METRIC_NAME,
            {"application_id": application_id})
        error_timestamp = error_measurement[0]
        error = error_measurement[1]
        error = round(error, self.metric_rounding)
        return error_timestamp, error

    def _check_measurements_are_new(self, error_timestamp):
        """True when the measurement is newer than the last one handled."""
        return self.last_error_timestamp < error_timestamp

    def status(self):
        """Return a description of the alarm's last action."""
        return self.last_action
class GenericAlarm:
    """Alarm that adjusts per-VM CPU caps by a fixed actuation_size step
    according to the application's progress error."""

    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, trigger_down, trigger_up,
                 min_cap, max_cap, actuation_size, metric_rounding,
                 application_id, instances):
        self.metric_source = metric_source
        self.actuator = actuator
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_cap = min_cap
        self.max_cap = max_cap
        self.metric_rounding = metric_rounding
        self.actuation_size = actuation_size
        self.application_id = application_id
        self.instances = instances
        self.logger = ScalingLog("%s.generic.alarm.log" % (application_id),
                                 "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)
        self.last_error = ""
        # Sentinel timestamp: older than any real measurement.
        self.last_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Fetch the most recent progress error and, when it is newer than the
        last measurement handled, run the scale-down and scale-up checks.
        Any exception is logged and suppressed.
        """
        try:
            self.logger.log("Getting progress error")
            self.last_action = "getting progress error"
            error_timestamp, error = self._get_error(self.application_id)
            self.last_action = "Progress error-[%s]-%f" % (
                str(error_timestamp), error)
            self.logger.log(self.last_action)
            # Stale measurement: record the fact and bail out early.
            if not self._check_measurements_are_new(error_timestamp):
                self.last_action += " Could not acquire more recent metrics"
                self.logger.log("Could not acquire more recent metrics")
                return
            self._scale_down(error, self.instances)
            self._scale_up(error, self.instances)
            if self.cap != -1:
                self.cap_logger.log(
                    "%.0f|%s|%s" % (time.time(), str(self.application_id),
                                    str(self.cap)))
            self.last_error = error
            self.last_error_timestamp = error_timestamp
        except Exception as e:
            # TODO: Check exception type
            self.logger.log(str(e))
            return

    def _scale_down(self, error, instances):
        """
        When the error is positive and at least trigger_down, lower the CPU
        cap by actuation_size (never below min_cap) on every instance.
        """
        if error <= 0 or error < self.trigger_down:
            return
        self.logger.log("Scaling down")
        self.last_action = "Getting allocated resources"
        # Read the cap currently applied to the cluster.
        current = self.actuator.get_allocated_resources_to_cluster(instances)
        target = max(current - self.actuation_size, self.min_cap)
        self.logger.log("Scaling from %d to %d" % (current, target))
        self.last_action = "Scaling from %d to %d" % (current, target)
        # The same cap is applied to every vm in the cluster.
        self.actuator.adjust_resources(
            dict((vm, target) for vm in instances))
        self.cap = target

    def _scale_up(self, error, instances):
        """
        When the error is negative and its magnitude is at least
        trigger_up, raise the CPU cap by actuation_size (never above
        max_cap) on every instance.
        """
        if error >= 0 or abs(error) < self.trigger_up:
            return
        self.logger.log("Scaling up")
        self.last_action = "Getting allocated resources"
        # Read the cap currently applied to the cluster.
        current = self.actuator.get_allocated_resources_to_cluster(instances)
        target = min(current + self.actuation_size, self.max_cap)
        self.logger.log("Scaling from %d to %d" % (current, target))
        self.last_action = "Scaling from %d to %d" % (current, target)
        # The same cap is applied to every vm in the cluster.
        self.actuator.adjust_resources(
            dict((vm, target) for vm in instances))
        self.cap = target

    def _get_error(self, application_id):
        """Return the newest (timestamp, rounded value) of the error metric."""
        measurement = self.metric_source.get_most_recent_value(
            GenericAlarm.ERROR_METRIC_NAME,
            {"application_id": application_id})
        timestamp = measurement[0]
        value = round(measurement[1], self.metric_rounding)
        return timestamp, value

    def _check_measurements_are_new(self, error_timestamp):
        """True when the measurement is newer than the last one handled."""
        return self.last_error_timestamp < error_timestamp

    def status(self):
        """Return a description of the alarm's last action."""
        return self.last_action
class KubeJobs:
    """
    Alarm for KubeJobs applications.

    Periodically compares the measured progress error against the last one
    seen and, when a newer measurement is available, delegates the scaling
    decision to the configured scheduler and applies it via the actuator.
    """

    # Name of the progress-error metric read from the metric source
    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, data):
        """
        :param data: dict carrying "control_parameters" (with at least
            'metric_source', 'schedule_strategy' and 'actuator') and
            'app_id'.
        :raises ValueError: if 'schedule_strategy' names an unknown
            strategy (see setup_scheduler).
        """
        # TODO: Check parameters
        scaling_parameters = data["control_parameters"]

        # Plugin that provides progress-error measurements
        self.metric_source = MetricSourceBuilder().\
            get_metric_source(scaling_parameters.get('metric_source'), data)

        self.app_id = data.get('app_id')
        scaling_parameters.update({'app_id': self.app_id})

        # Strategy that decides the new resource allocation
        self.scheduler = self.setup_scheduler(scaling_parameters)
        # Plugin that applies the decision to the cluster
        self.actuator = self.setup_actuator(scaling_parameters)

        self.logger = ScalingLog("%s.generic.alarm.log" % (self.app_id),
                                 "controller.log", self.app_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (self.app_id),
                                     "cap.log", self.app_id)

        # Sentinel "older than anything" timestamp so the first real
        # measurement is always considered new
        self.last_progress_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def setup_scheduler(self, parameters):
        """
        Build the scheduler selected by 'schedule_strategy'.

        :raises ValueError: for an unrecognized strategy name. (Previously
            this silently returned None, which only surfaced later as an
            AttributeError inside _scale.)
        """
        strategy = parameters.get('schedule_strategy')
        if strategy == "default":
            return DefaultScheduler(parameters)
        elif strategy == "pid":
            return PidScheduler(parameters)
        raise ValueError("Unknown schedule_strategy: %r" % (strategy,))

    def setup_actuator(self, parameters):
        """Build the actuator plugin named by 'actuator'."""
        actuator = parameters.get('actuator')
        return ActuatorBuilder().get_actuator(actuator, parameters)

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and tries to modify
        the amount of allocated resources if necessary.
        """
        # TODO: Check parameters
        progress_error_timestamp, progress_error = \
            self._get_progress_error(self.app_id)

        self.last_action = "Progress error-[%s]-%f" % \
            (str(progress_error_timestamp), progress_error)

        # Act only on measurements newer than the last one processed
        if self._check_measurements_are_new(progress_error_timestamp):
            self._scale(progress_error)
            # cap == -1 means no cap value has been recorded yet
            if self.cap != -1:
                self.cap_logger.log("%.0f|%s|%s" % (time.time(),
                                                    str(self.app_id),
                                                    str(self.cap)))
            self.last_progress_error_timestamp = progress_error_timestamp
        else:
            self.last_action += " Could not acquire more recent metrics"
            self.logger.log(self.last_action)

    def _get_progress_error(self, app_id):
        """Return (timestamp, value) of the most recent progress error."""
        self.last_action = "Getting progress error"
        self.logger.log(self.last_action)
        progress_error_measurement = self.metric_source.get_most_recent_value(
            app_id)
        progress_error_timestamp = progress_error_measurement[0]
        progress_error = progress_error_measurement[1]
        return progress_error_timestamp, progress_error

    def _scale(self, progress_error):
        """
        Ask the scheduler for a new allocation based on the current error
        and replica count, and apply it when a change is requested.
        """
        last_replicas = self.actuator.get_number_of_replicas()
        info = {'last_replicas': last_replicas,
                'progress_error': progress_error}
        new_resource_allocation = self.scheduler.scale(info)
        # None from the scheduler means "no change needed"
        if new_resource_allocation is not None:
            self.logger.log("Scaling from %d to %d" %
                            (last_replicas, new_resource_allocation))
            self.actuator.adjust_resources(new_resource_allocation)

    def _check_measurements_are_new(self, progress_error_timestamp):
        """True if the measurement is strictly newer than the last used."""
        return self.last_progress_error_timestamp < progress_error_timestamp

    def status(self):
        """Return a human-readable description of the last action taken."""
        return self.last_action
class Vertical:
    """
    Alarm that scales a job vertically by adjusting the host's CPU quota
    through an external HTTP API, based on the job's progress error.
    """

    # Name of the progress-error metric read from the metric source
    ERROR_METRIC_NAME = "application-progress.error"

    def __init__(self, actuator, metric_source, actuator_metric,
                 trigger_down, trigger_up, min_quota, max_quota,
                 application_id):
        # TODO: Check parameters
        self.metric_source = metric_source
        self.actuator = actuator
        # Which resource this alarm controls; only 'cpu' is acted upon
        self.actuator_metric = actuator_metric
        self.trigger_down = trigger_down
        self.trigger_up = trigger_up
        self.min_quota = min_quota
        self.max_quota = max_quota
        self.application_id = application_id

        self.logger = ScalingLog("%s.vertical.alarm.log" % (application_id),
                                 "controller.log", application_id)
        self.cap_logger = ScalingLog("%s.cap.log" % (application_id),
                                     "cap.log", application_id)

        # Sentinel "older than anything" timestamp so the first real
        # measurement is always considered new
        self.last_progress_error_timestamp = datetime.datetime.strptime(
            "0001-01-01T00:00:00.0Z", '%Y-%m-%dT%H:%M:%S.%fZ')
        self.last_action = ""
        self.cap = -1

    def check_application_state(self):
        """
        Checks the application progress by getting progress metrics from a
        metric source, checks if the metrics are new and tries to modify the
        amount of allocated resources if necessary.
        """
        # TODO: Check parameters
        try:
            self.logger.log("Getting progress error")
            self.last_action = "getting progress error"

            # Get the progress error value and timestamp
            progress_error_timestamp, progress_error = \
                self._get_progress_error(self.application_id)

            self.logger.log("Progress error-[%s]-%f" %
                            (str(progress_error_timestamp), progress_error))
            self.last_action = "Progress error-[%s]-%f" % (
                str(progress_error_timestamp), progress_error)

            # Act only on measurements newer than the last one processed
            if self._check_measurements_are_new(progress_error_timestamp):
                self._scale_down(progress_error)
                self._scale_up(progress_error)
                # cap == -1 means no cap value has been recorded yet
                if self.cap != -1:
                    self.cap_logger.log(
                        "%.0f|%s|%s" % (time.time(),
                                        str(self.application_id),
                                        str(self.cap)))
                self.last_progress_error_timestamp = progress_error_timestamp
            else:
                self.last_action += " Could not acquire more recent metrics"
                self.logger.log("Could not acquire more recent metrics")
        except Exception as e:
            # TODO: Check exception type
            self.logger.log(str(e))
            # Bare raise preserves the original traceback ("raise e" would
            # reset it under Python 2)
            raise

    def _scale_down(self, progress_error):
        """Scales down the specific resource using an external API.

        Arguments:
            progress_error {float} -- progress error of the job
        """
        # If the error is positive and its absolute value is too high,
        # scale down
        if progress_error > 0 and progress_error >= self.trigger_down:
            if self.actuator_metric == 'cpu':
                self.logger.log("Scaling down")
                self.last_action = "Getting allocated resources"
                # NOTE(review): this message prints max_quota twice — looks
                # like a copy-paste slip; kept as-is to preserve log output.
                # TODO confirm intended values.
                self.logger.log(
                    "Scaling %s quota from %d / %d" %
                    (self.actuator_metric, self.max_quota, self.max_quota))
                print("Scaling %s from %d / %d" %
                      (self.actuator_metric, self.max_quota, self.max_quota))
                self.set_cpu_quota(self.max_quota)

    def _scale_up(self, progress_error):
        """Scales up the specific resource using an external API.

        Arguments:
            progress_error {float} -- progress error of the job
        """
        # If the error is negative and its absolute value is too high,
        # scale up
        if progress_error < 0 and abs(progress_error) >= self.trigger_up:
            if self.actuator_metric == 'cpu':
                self.logger.log("Scaling up")
                self.last_action = "Getting allocated resources"
                self.logger.log("Scaling from %d / %d" %
                                (self.min_quota, self.max_quota))
                print("Scaling from %d / %d" %
                      (self.min_quota, self.max_quota))
                self.set_cpu_quota(self.min_quota)

    def _get_progress_error(self, application_id):
        """Gets the progress error of the job

        Arguments:
            application_id {string} -- The application identifier

        Returns:
            [tuple] -- Returns a tuple containing the progress error
            timestamp and the current value of the progress error
        """
        progress_error_measurement = self.metric_source.get_most_recent_value(
            application_id)
        progress_error_timestamp = progress_error_measurement[0]
        progress_error = progress_error_measurement[1]
        return progress_error_timestamp, progress_error

    def _check_measurements_are_new(self, progress_error_timestamp):
        """Check if the current measurements were already computed.

        Arguments:
            progress_error_timestamp {string} -- Timestamp of the current
            progress error

        Returns:
            [boolean] -- 'true' if the measurements are new, 'false'
            otherwise
        """
        return self.last_progress_error_timestamp < progress_error_timestamp

    def status(self):
        """Return a human-readable description of the last action taken."""
        return self.last_action

    def set_cpu_quota(self, new_cpu_quota):
        """Sets the CPU quota of the physical machine using an external API

        Arguments:
            new_cpu_quota {int} -- The new value for the CPU quota
                of the machine
        """
        try:
            # Payload is byte-identical to the previous hand-built JSON
            requests.post('http://%s:5000' % (self.actuator.api_address),
                          data='{"cpu_quota":"' + str(new_cpu_quota) + '"}')
        except Exception as e:
            # Renamed from "ex" to avoid shadowing the module-level "ex"
            # exceptions alias; str(e) replaces the Python-2-only
            # "print ex.message" (".message" does not exist in Python 3).
            print("Error while modifying cpu quota")
            print(str(e))
            raise