def _set_autoscaling(self):
    """Locate the cluster's user autoscaling group and record its name.

    Prefers the variable ASG, then falls back to the spot ASG and finally
    the on-demand ASG. The resolved name is stored under the "ASG_NAME"
    replacement key for later template substitution.

    :raises AXPlatformException: if no ASG is found, or the ASG has no name.
    """
    # Prepare autoscaler
    manager = AXUserASGManager(self._cluster_name_id, self._region,
                               self._aws_profile)
    asg = (manager.get_variable_asg()
           or manager.get_spot_asg()
           or manager.get_on_demand_asg())
    if not asg:
        raise AXPlatformException(
            "Failed to get autoscaling group for cluster {}".format(
                self._cluster_name_id))

    name = asg["AutoScalingGroupName"]
    if name is None:
        logger.error("Autoscaling group name not found for %s",
                     self._cluster_name_id)
        raise AXPlatformException("Cannot find cluster autoscaling group")
    self._replacing["ASG_NAME"] = name
def asgs_to_option(self, asgs):
    """Return the spot-instance config option matching the given ASG names.

    :param asgs: iterable of ASG names running spot instances, or None.
    :return: SpotInstanceOption.NO_SPOT when no ASGs are given,
        ALL_SPOT when the given names cover every cluster ASG,
        PARTIAL_SPOT otherwise.
    """
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region)
    all_asg_names = asg_manager.get_all_asg_names()
    # `not asgs` covers both None and the empty list.
    if not asgs:
        return SpotInstanceOption.NO_SPOT
    if set(asgs) == set(all_asg_names):
        return SpotInstanceOption.ALL_SPOT
    return SpotInstanceOption.PARTIAL_SPOT
    # (dead trailing `return` removed -- it was unreachable)
def option_to_asgs(self, option):
    """Return the ASG names corresponding to a spot-instance config option.

    :param option: one of SpotInstanceOption.VALID_SPOT_INSTANCE_OPTIONS.
    :return: list of ASG names -- all ASGs for ALL_SPOT, empty for NO_SPOT,
        and only the variable ASG for PARTIAL_SPOT.
    """
    assert option in SpotInstanceOption.VALID_SPOT_INSTANCE_OPTIONS, \
        "{} is not a valid spot instance option".format(option)
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region)
    if option == SpotInstanceOption.ALL_SPOT:
        return asg_manager.get_all_asg_names()
    if option == SpotInstanceOption.NO_SPOT:
        return []
    # PARTIAL_SPOT: only the variable ASG runs spot instances.
    return [asg_manager.get_variable_asg()["AutoScalingGroupName"]]
    # (dead trailing `return` removed -- it was unreachable)
def put_spot_instance_config():
    """REST handler: update the cluster's spot-instance configuration.

    Reads 'enabled' (bool or "true"/"false" string) and the optional
    'spot_instances_option' from the request, translates the option into
    concrete ASG names, and forwards the resulting config to the
    minion-manager service.

    :return: JSON {"status": "ok"} on success.
    :raises AXIllegalOperationException: on user-provided K8S clusters.
    :raises ValueError: if 'enabled' is neither a string nor a boolean.
    """
    if AXClusterConfig().get_cluster_provider().is_user_cluster():
        raise AXIllegalOperationException("Spot instances not allowed on user provided K8S clusters.")

    (data,) = _get_optional_arguments('enabled')
    if isinstance(data, bool):
        enabled_str = str(data)
    elif isinstance(data, string_types):
        # Normalize any string to the canonical "True"/"False" form.
        enabled_str = "True" if data.lower() == "true" else "False"
    else:
        raise ValueError("enabled must be string or boolean")
    payload = {'enabled': enabled_str}
    # (removed: an AXUserASGManager was constructed here but never used)

    # Get "spot_instances_option" option
    (option,) = _get_optional_arguments('spot_instances_option')
    if option is not None:
        spot_option_mgr = SpotInstanceOptionManager(
            cluster_name_id, AXClusterConfig().get_region())
        asg_names = spot_option_mgr.option_to_asgs(option)
        asg_option = " ".join(asg_names)

        if option == SpotInstanceOption.NO_SPOT:
            # No spot ASGs means the minion-manager has nothing to manage.
            _app.logger.info("ASGS passed in a \"none\". Disabling minion-manager.")
            enabled_str = "False"
            payload['enabled'] = enabled_str
        payload['asgs'] = asg_option

    response = requests.put(
        MINION_MANAGER_HOSTNAME + ":" + MINION_MANAGER_PORT + "/spot_instance_config",
        params=payload)
    response.raise_for_status()
    _app.logger.info("Change in Spot instance config: {}".format(enabled_str))
    return jsonify({"status": "ok"})
def modify_asg(self, min, max):
    """Resize the cluster's variable autoscaling group.

    :param min: requested minimum size.
        NOTE(review): currently ignored -- ``minsize`` is hard-coded to 1
        in the call below; confirm whether that is intentional.
    :param max: maximum size applied to the variable ASG.
    :raises AXPlatformException: if the variable ASG cannot be found, or
        the AWS update call fails with a ClientError.
    """
    logger.info("Modifying autoscaling group ...")
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                   self._aws_profile)
    asg = asg_manager.get_variable_asg()
    if not asg:
        raise AXPlatformException(
            "Failed to get variable autoscaling group for cluster {}".
            format(self._cluster_name_id))
    asg_name = asg["AutoScalingGroupName"]

    try:
        # minsize is fixed at 1 regardless of the `min` argument (see note).
        asg_manager.set_asg_spec(name=asg_name, minsize=1, maxsize=max)
    except ClientError as ce:
        raise AXPlatformException(
            "Failed to set cluster's variable autoscaling group min/max. Error: {}"
            .format(ce))

    logger.info("Modifying cluster autoscaling group ... DONE")
def _recover_auto_scaling_groups(self):
    """
    This steps does the following:
        - fetch the previously restored auto scaling group config. If this config cannot be found,
          we can assume that all autoscaling groups have correct configurations. This could happen
          when previous restart failed in the middle but passed this stage already, or the cluster
          is not even paused
        - Wait for all instances to be in service
    :return:
    """
    # Get previously persisted asg status
    logger.info("Fetching last cluster status ...")
    cluster_status_raw = self._cluster_info.download_cluster_status_before_pause()

    asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                               aws_profile=self._cfg.cloud_profile,
                               region=self._cluster_config.get_region())

    if cluster_status_raw:
        logger.info("Found last cluster status, restoring cluster ...")
        # The status blob is produced by our own yaml.dump during pause
        # (_scale_down_auto_scaling_groups), so safe_load is sufficient and
        # avoids arbitrary-object construction from yaml.load.
        cluster_status = yaml.safe_load(cluster_status_raw)
        all_asg_statuses = cluster_status["asg_status"]

        # Restore minions
        for asg_name, asg_status in all_asg_statuses.items():
            min_size = asg_status["min_size"]
            max_size = asg_status["max_size"]
            desired = asg_status["desired_capacity"]
            self._total_nodes += desired
            logger.info(
                "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                asg_name, min_size, max_size, desired)
            asg_mgr.set_asg_spec(name=asg_name, minsize=min_size,
                                 maxsize=max_size, desired=desired)

        logger.info("Waiting for all auto scaling groups to scale up ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster instances are in service%s", COLOR_GREEN,
                    COLOR_NORM)

        # Delete previously stored cluster status
        self._cluster_info.delete_cluster_status_before_pause()
    else:
        all_asgs = asg_mgr.get_all_asgs()
        for asg in all_asgs:
            self._total_nodes += asg["DesiredCapacity"]
        logger.info(
            "Cannot find last cluster status, cluster already resumed with %s nodes",
            self._total_nodes)
def _scale_down_auto_scaling_groups(self):
    """
    This step:
        - Persist autoscaling group states to S3,
        - Scale down all autoscaling groups to zero,
        - Wait for all minion to be terminated
    :return:
    """
    logger.info("Discovering autoscaling groups")
    manager = AXUserASGManager(cluster_name_id=self._name_id,
                               aws_profile=self._cfg.cloud_profile,
                               region=self._cluster_config.get_region())
    groups = manager.get_all_asgs()

    # Snapshot every group's sizing so a later restart can restore the
    # exact same node counts.
    snapshot = {
        "asg_status": {
            group["AutoScalingGroupName"]: {
                "min_size": group["MinSize"],
                "max_size": group["MaxSize"],
                "desired_capacity": group["DesiredCapacity"],
            }
            for group in groups
        }
    }
    self._cluster_info.upload_cluster_status_before_pause(
        status=yaml.dump(snapshot))

    # Scale down asg
    logger.info("Scaling down autoscaling groups ...")
    for group in groups:
        manager.set_asg_spec(name=group["AutoScalingGroupName"],
                             minsize=0, maxsize=0)

    # Waiting for nodes to be terminated
    logger.info("Waiting for all auto scaling groups to scale down ...")
    manager.wait_for_desired_asg_state()
    logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN,
                COLOR_NORM)
def _generate_replacing(self):
    """Build the macro-replacement dict used to render platform templates.

    Collects trusted CIDRs, axsys/daemon resource usage, autoscaler
    sizing parameters, the user ASG name, and the minion-manager spot
    configuration, and returns them as a flat {MACRO_NAME: value} mapping.
    """
    # Platform code are running in python 2.7, and therefore for trusted
    # cidr list, the str() method will return something like
    # [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'],
    # and this 'u' prefix cannot be suppressed. With this prefix, our
    # macro replacing would create invalid yaml files, and therefore we
    # construct string manually here
    trusted_cidr = self._cluster_config.get_trusted_cidr()
    if isinstance(trusted_cidr, list):
        trusted_cidr_str = "["
        for cidr in trusted_cidr:
            trusted_cidr_str += "\"{}\",".format(str(cidr))
        # Drop the trailing comma.
        # NOTE(review): an empty list would chop the opening "[" here --
        # presumably trusted_cidr is never empty; confirm with callers.
        trusted_cidr_str = trusted_cidr_str[:-1]
        trusted_cidr_str += "]"
    else:
        trusted_cidr_str = "[{}]".format(trusted_cidr)

    # Sum resource requests over all platform kube objects: axsys pods
    # plus per-node daemons (in millicores / Mi).
    axsys_cpu = 0
    axsys_mem = 0
    daemon_cpu = 0
    daemon_mem = 0
    for name in self._kube_objects.keys():
        cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage
        axsys_cpu += cpu
        axsys_mem += mem
        daemon_cpu += dcpu
        daemon_mem += dmem

    # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
    # have a memory request, but this is an approximation)
    daemon_cpu += 100
    daemon_mem += 100

    logger.info(
        "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi",
        axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

    # User-node counts exclude the axsys (system) nodes.
    axsys_node_count = int(self._cluster_config.get_asxys_node_count())
    axuser_min_count = str(
        int(self._cluster_config.get_min_node_count()) - axsys_node_count)
    axuser_max_count = str(
        int(self._cluster_config.get_max_node_count()) - axsys_node_count)
    autoscaler_scan_interval = str(
        self._cluster_config.get_autoscaler_scan_interval())

    # Fraction of a user node permanently reserved by daemons; the
    # scale-down threshold must sit just above the larger of the two so
    # a node running only daemons is considered idle.
    usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[
        self._cluster_config.get_axuser_node_type()]["cpu"]
    usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[
        self._cluster_config.get_axuser_node_type()]["memory"]
    scale_down_util_thresh = round(
        max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
    logger.info("Setting node scale down utilization threshold to %s",
                scale_down_util_thresh)

    self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

    with open("/kubernetes/cluster/version.txt", "r") as f:
        cluster_install_version = f.read().strip()

    # Prepare autoscaler
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                   self._aws_profile)
    asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg(
    ) or asg_manager.get_on_demand_asg()
    if not asg:
        raise AXPlatformException(
            "Failed to get autoscaling group for cluster {}".format(
                self._cluster_name_id))
    asg_name = asg["AutoScalingGroupName"]
    if not asg_name:
        logger.error("Autoscaling group name not found for %s",
                     self._cluster_name_id)
        raise AXPlatformException("Cannot find cluster autoscaling group")

    # Prepare minion-manager.
    spot_instances_option = self._cluster_config.get_spot_instances_option()
    minion_manager_asgs = ""
    if spot_instances_option == SpotInstanceOption.ALL_SPOT:
        # Space-separated list of every ASG name (trailing space stripped).
        for asg in asg_manager.get_all_asgs():
            minion_manager_asgs = minion_manager_asgs + asg[
                "AutoScalingGroupName"] + " "
        minion_manager_asgs = minion_manager_asgs[:-1]
    elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
        # Only the variable ASG runs spot instances.
        minion_manager_asgs = asg_manager.get_variable_asg(
        )["AutoScalingGroupName"]

    return {
        "REGISTRY": self._software_info.registry,
        "REGISTRY_SECRETS": self._software_info.registry_secrets,
        "NAMESPACE": self._software_info.image_namespace,
        "VERSION": self._software_info.image_version,
        "AX_CLUSTER_NAME_ID": self._cluster_name_id,
        "AX_AWS_REGION": self._region,
        "AX_AWS_ACCOUNT": self._account,
        "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
        "TRUSTED_CIDR": trusted_cidr_str,
        "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
        "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
        "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
        "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
        "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
        "ARGO_LOG_BUCKET_NAME":
            self._cluster_config.get_support_object_store_name(),
        "ASG_MIN": axuser_min_count,
        "ASG_MAX": axuser_max_count,
        "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval,
        "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh),
        "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(
            key=self._cluster_config_path.cluster_metadata()),
        "ASG_NAME": asg_name,
        "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP",
                                   default_kube_up_env["DNS_SERVER_IP"]),
        "AX_ENABLE_SPOT_INSTANCES":
            str(spot_instances_option != SpotInstanceOption.NO_SPOT),
        "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs,
    }
def ax_asg_helper(self):
    """Return an AXUserASGManager wired against the mocked AWS setup."""
    self.mock_setup()
    manager = AXUserASGManager(self.cluster_name_id, 'us-west-2')
    return manager
from werkzeug.exceptions import BadRequest

_app = Flask("AXmon")
axmon = None

# Rlock for counting the max concurrent requests
concurrent_reqs_lock = RLock()
concurrent_reqs = 0
MAX_CONCURRENT_REQS = 100

# Endpoint of the minion-manager service (kube-system namespace).
MINION_MANAGER_HOSTNAME = "http://minion-manager.kube-system"
MINION_MANAGER_PORT = "6000"

kubectl = KubernetesApiClient(use_proxy=True)
cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
asg_manager = AXUserASGManager(os.getenv("AX_CLUSTER_NAME_ID"),
                               AXClusterConfig().get_region())

# Need a lock to serialize cluster config operation
cfg_lock = RLock()

# Prometheus metrics: per-endpoint latency summary and in-flight gauge.
axmon_api_latency_stats = Summary("axmon_api_latency",
                                  "Latency for axmon REST APIs",
                                  ["method", "endpoint", "status"])
axmon_api_concurrent_reqs = Gauge("axmon_api_concurrent_reqs",
                                  "Concurrent requests in axmon")


def before_request():
    # Stamp the request start time so latency can be computed later.
    request.start_time = time.time()
    global concurrent_reqs, MAX_CONCURRENT_REQS, concurrent_reqs_lock
    with concurrent_reqs_lock:
        axmon_api_concurrent_reqs.set(concurrent_reqs)
        # Disabling concurrent request logic for now due to findings in AA-3167