def update_cluster_config(self):
    """Upgrade the cluster config stored in S3 so it has all required fields.

    Performs an in-place, idempotent migration of an older cluster config:
      1. Renames legacy node-count keys (minion_type/max_count/min_count/axsys_nodes)
         to the new axsys/axuser split keys, if the old schema is detected.
      2. Defaults cluster type, cluster user, cluster size, and volume size
         when they are unset.
      3. Ensures the three node tiers exist and refreshes the AMI id from the
         AX_AWS_IMAGE_NAME environment variable.
    Finally persists the updated config back to S3 via save_config().
    """
    logger.info("Updating cluster config!")
    cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id,
                                     aws_profile=self._profile)
    # NOTE(review): cluster_info is never used below; kept because AXClusterInfo's
    # constructor may have side effects — confirm and remove if it does not.
    cluster_info = AXClusterInfo(cluster_name_id=self._cluster_name_id,
                                 aws_profile=self._profile)

    # Alias the raw "configure" section; this migration intentionally pokes at
    # the private _conf because the old schema predates the accessor methods.
    configure = cluster_config._conf["cloud"]["configure"]

    # Separate axsys / axuser config if needed. A new-style config answers
    # get_max_node_count(); an old-style config raises KeyError instead.
    update_node_config_key_needed = False
    try:
        cluster_config.get_max_node_count()
    except KeyError:
        update_node_config_key_needed = True

    if update_node_config_key_needed:
        logger.info("Updating node config keys ...")
        # Parse old raw config directly
        minion_type = configure["minion_type"]
        max_count = configure["max_count"]
        min_count = configure["min_count"]
        axsys_count = configure["axsys_nodes"]

        # Remove all old keys
        for old_key in ["minion_type", "max_count", "min_count", "axsys_nodes"]:
            configure.pop(old_key, None)

        # Set the new-style keys
        configure["axsys_node_count"] = axsys_count
        configure["max_node_count"] = max_count
        configure["min_node_count"] = min_count
        # All clusters that need this upgrade use the same node type for
        # axsys and axuser nodes.
        configure["axuser_node_type"] = minion_type
        configure["axsys_node_type"] = minion_type
    else:
        logger.info("Node config keys are already up-to-date")

    # If cluster type is not set, default it to standard type.
    # (Fixed: compare to the None singleton with `is`, not `==`.)
    if cluster_config.get_ax_cluster_type() is None:
        configure["cluster_type"] = AXClusterType.STANDARD

    # Check and update cluster user. Defaults to "customer".
    if cluster_config.get_ax_cluster_user() is None:
        cluster_config.set_ax_cluster_user('customer')

    # Check and update cluster size. Defaults to "small" for unknown counts.
    if cluster_config.get_ax_cluster_size() is None:
        node_count_to_size = {5: "small", 10: "medium", 21: "large", 30: "xlarge"}
        cluster_size = node_count_to_size.get(
            cluster_config.get_max_node_count(), "small")
        cluster_config.set_ax_cluster_size(cluster_size)

    # Check and update AX volume size. Note that this has to come *AFTER*
    # the cluster_size is set. Defaults to 100 for unknown sizes.
    if cluster_config.get_ax_vol_size() is None:
        size_to_vol = {"small": 100, "medium": 100, "large": 200, "xlarge": 400}
        vol_size = size_to_vol.get(cluster_config.get_ax_cluster_size(), 100)
        cluster_config.set_ax_vol_size(vol_size)

    # Ensure that we have 3 tiers now
    cluster_config.set_node_tiers("master/applatix/user")

    # Refresh the AMI id from the environment-specified image name.
    ami_name = os.getenv("AX_AWS_IMAGE_NAME")
    ami_id = AMI(
        aws_region=self._region,
        aws_profile=self._profile).get_ami_id_from_name(ami_name=ami_name)
    logger.info("Updating cluster config with ami %s", ami_id)
    cluster_config.set_ami_id(ami_id)

    cluster_config.save_config()
class AXSYSKubeYamlUpdater(object):
    """
    Loads a Kubernetes yaml file, scales container/volume resource values by
    cluster-size-dependent multipliers, and generates objects that
    kube_object.py can consume (both swagger objects and plain dicts).
    """
    def __init__(self, config_file_path):
        # config_file_path: path to a (possibly multi-document) kube yaml file.
        assert os.path.isfile(
            config_file_path), "Config file {} is not a file".format(
                config_file_path)
        self._config_file = config_file_path
        self._cluster_name_id = AXClusterId().get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id)
        # Multipliers applied to cpu/mem/disk resource values in the yaml;
        # daemons get their own cpu/mem multipliers.
        self.cpu_mult, self.mem_mult, self.disk_mult, \
            self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers()
        self._swagger_components = []   # parsed swagger objects, one per yaml doc
        self._yaml_components = []      # same components, serialized back to dicts
        self._updated_raw = ""          # all updated docs re-dumped as one yaml string
        # TODO: when we support config software info using a config file, need to figure out how that
        # file gets passed through, since SoftwareInfo is not a singleton
        self._software_info = SoftwareInfo()
        self._load_objects()
        self._load_raw()

    @property
    def updated_raw(self):
        # Full updated yaml text (all documents).
        return self._updated_raw

    @property
    def components_in_dict(self):
        # Updated components as plain serializable dicts.
        return self._yaml_components

    @property
    def components_in_swagger(self):
        # Updated components as kube swagger model objects.
        return self._swagger_components

    def _load_objects(self):
        # Parse every yaml document in the config file, patch it via
        # _config_yaml, and keep both swagger and dict representations.
        with open(self._config_file, "r") as f:
            data = f.read()
            for c in yaml.load_all(data):
                swagger_obj = self._config_yaml(c)
                yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj)
                self._swagger_components.append(swagger_obj)
                self._yaml_components.append(yaml_obj)

    def _load_raw(self):
        # Re-dump all updated components into a single multi-document string.
        self._updated_raw = yaml.dump_all(self._yaml_components)

    def _get_resource_multipliers(self):
        """
        Resources in yaml templates need to be multiplied with these numbers
        :return: cpu_multiplier, mem_multiplier, disk_multiplier,
                 daemon_cpu_multiplier, daemon_mem_multiplier
        """
        # Getting cluster size from cluster config, in order to configure resources
        # There are 3 situations we will be using AXClusterConfig
        # - During install, since the class is a singleton, it has all the values we need
        #   no need to download from s3
        # - During upgrade, since we are exporting AWS_DEFAULT_PROFILE, we can download
        #   cluster config files from s3 to get the values
        # - During job creation: the node axmon runs has the proper roles to access s3
        try:
            # NOTE(review): "get_asxys_node_count" looks like a typo of
            # "axsys" — confirm this is the real AXClusterConfig method name.
            ax_node_max = int(self._cluster_config.get_asxys_node_count())
            ax_node_type = self._cluster_config.get_axsys_node_type()
            usr_node_max = int(
                self._cluster_config.get_max_node_count()) - ax_node_max
            usr_node_type = self._cluster_config.get_axuser_node_type()
            assert all(
                [ax_node_max, ax_node_type, usr_node_max, usr_node_type])
        except Exception as e:
            # Best-effort: if config is unreadable, fall back to multiplier 1
            # everywhere (i.e. leave yaml resources unscaled).
            logger.error(
                "Unable to read cluster config, skip resource config for %s. Error %s",
                self._config_file, e)
            return 1, 1, 1, 1, 1
        rc = AXSYSResourceConfig(
            ax_node_type=ax_node_type,
            ax_node_max=ax_node_max,
            usr_node_type=usr_node_type,
            usr_node_max=usr_node_max,
            cluster_type=self._cluster_config.get_ax_cluster_type())
        #logger.info("With %s %s axsys nodes, %s %s axuser nodes, component %s uses multipliers (%s, %s, %s, %s, %s)",
        #            ax_node_max, ax_node_type, usr_node_max, usr_node_type, self._config_file,
        #            rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier,
        #            rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier)
        return rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier

    def _config_yaml(self, kube_yaml_obj):
        """
        Load dict into swagger object, patch resource, sanitize, return a dict
        :param kube_yaml_obj: one parsed yaml document (dict) with a "kind" key
        :return: swagger object with resource values finalized
        """
        kube_kind = kube_yaml_obj["kind"]
        (swagger_class_literal,
         swagger_instance) = KubeKindToV1KubeSwaggerObject[kube_kind]
        # Uses the name-mangled private deserializer of the kube ApiClient to
        # turn the raw dict into the matching swagger model object.
        swagger_obj = ApiClient()._ApiClient__deserialize(
            kube_yaml_obj, swagger_class_literal)
        assert isinstance(swagger_obj, swagger_instance), \
            "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance)

        if isinstance(swagger_obj, V1beta1Deployment):
            if not self._software_info.registry_is_private():
                # Public registry: no pull secret needed.
                swagger_obj.spec.template.spec.image_pull_secrets = None
            node_selector = swagger_obj.spec.template.spec.node_selector
            if node_selector.get('ax.tier', 'applatix') == 'master':
                # Skip updating containers on master.
                logger.info(
                    "Skip updating cpu, mem multipliers for pods on master: %s",
                    swagger_obj.metadata.name)
            else:
                for container in swagger_obj.spec.template.spec.containers:
                    self._update_container(container)
            return swagger_obj
        elif isinstance(swagger_obj, V1Pod):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.image_pull_secrets = None
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1DaemonSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            for container in swagger_obj.spec.template.spec.containers:
                # We are special-casing applet DaemonSet to compromise the fact that
                # we are using different node type for compute-intense nodes
                if swagger_obj.metadata.name == "applet":
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=True)
                else:
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=False)
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1StatefulSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            return self._update_statefulset(swagger_obj)
        elif isinstance(swagger_obj, V1PersistentVolumeClaim):
            self._update_volume(swagger_obj)
            return swagger_obj
        else:
            # logger.info("Object %s does not need to configure resource", type(swagger_obj))
            # HACK, as the original hook will be messed up
            if isinstance(swagger_obj, V1Service):
                if swagger_obj.metadata.name == "axops":
                    # Restrict the axops load balancer to the cluster's trusted CIDRs.
                    swagger_obj.spec.load_balancer_source_ranges = []
                    for cidr in self._cluster_config.get_trusted_cidr():
                        # Seems swagger client does not support unicode ... SIGH
                        swagger_obj.spec.load_balancer_source_ranges.append(
                            str(cidr))
                # HACK #2: if we don't do this, kubectl will complain about something such as
                #
                # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z)
                #
                # p.target_port is defined as string though, but if its really a string, kubectl
                # is looking for a port name, rather than a number
                # SIGH ...
                for p in swagger_obj.spec.ports or []:
                    try:
                        p.target_port = int(p.target_port)
                    except (ValueError, TypeError):
                        # Named (non-numeric) target ports are left untouched.
                        pass
            return swagger_obj

    def _update_deployment_or_daemonset(self, kube_obj):
        # Scale resources of every container in a Deployment/DaemonSet template.
        assert isinstance(kube_obj, V1beta1Deployment) or isinstance(
            kube_obj, V1beta1DaemonSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        return kube_obj

    def _update_statefulset(self, kube_obj):
        # Scale resources of every container and every volume claim template.
        assert isinstance(kube_obj, V1beta1StatefulSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        if isinstance(kube_obj.spec.volume_claim_templates, list):
            for vol in kube_obj.spec.volume_claim_templates:
                self._update_volume(vol)
        return kube_obj

    def _update_container(self,
                          container,
                          is_daemon=False,
                          update_resource=True):
        # Scale a container's cpu/mem requests and limits by the appropriate
        # multipliers (daemon multipliers when is_daemon), coerce numeric probe
        # ports to int, and inject the default AX env vars.
        assert isinstance(container, V1Container)
        if update_resource:
            cpulim = container.resources.limits.get("cpu")
            memlim = container.resources.limits.get("memory")
            cpureq = container.resources.requests.get("cpu")
            memreq = container.resources.requests.get("memory")

            def _massage_cpu(orig):
                return orig * self.daemon_cpu_mult if is_daemon else orig * self.cpu_mult

            def _massage_mem(orig):
                return orig * self.daemon_mem_mult if is_daemon else orig * self.mem_mult

            if cpulim:
                rvc = ResourceValueConverter(value=cpulim, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.limits["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if cpureq:
                rvc = ResourceValueConverter(value=cpureq, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.requests["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if memlim:
                rvc = ResourceValueConverter(value=memlim, target="memory")
                rvc.massage(_massage_mem)
                container.resources.limits["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))
            if memreq:
                rvc = ResourceValueConverter(value=memreq, target="memory")
                rvc.massage(_massage_mem)
                container.resources.requests["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))

        # Same kubectl targetPort quirk as in _config_yaml: numeric probe
        # ports must be real ints, not numeric strings.
        if container.liveness_probe and container.liveness_probe.http_get:
            try:
                container.liveness_probe.http_get.port = int(
                    container.liveness_probe.http_get.port)
            except (ValueError, TypeError):
                pass
        if container.readiness_probe and container.readiness_probe.http_get:
            try:
                container.readiness_probe.http_get.port = int(
                    container.readiness_probe.http_get.port)
            except (ValueError, TypeError):
                pass

        # Add resource multiplier to containers in case we need them
        if not container.env:
            container.env = []
        container.env += self._generate_default_envs(is_daemon,
                                                     update_resource)

    def _update_volume(self, vol):
        # Scale a PVC's storage request by the disk multiplier and force the
        # access mode.
        assert isinstance(vol, V1PersistentVolumeClaim)
        vol_size = vol.spec.resources.requests["storage"]

        def _massage_disk(orig):
            return orig * self.disk_mult

        if vol_size:
            rvc = ResourceValueConverter(value=vol_size, target="storage")
            rvc.massage(_massage_disk)
            # Since AWS does not support value such as 1.5G, lets round up to its ceil
            vol.spec.resources.requests["storage"] = "{}Gi".format(
                int(ceil(rvc.convert("Gi"))))
        # Manually patch access mode as swagger client mistakenly interprets this as map
        vol.spec.access_modes = ["ReadWriteOnce"]

    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Add essential variables to all system containers.
        :param is_daemon: container belongs to a DaemonSet (use daemon multipliers)
        :param resource_updated: whether resources were actually scaled; when
               False for a daemon, CPU/MEM_MULT are reported as "1.0"
        :return: list of V1EnvVar (downward-API refs and literal values)
        """
        default_envs = [
            # Kubernetes downward APIs
            {
                "name": "AX_NODE_NAME",
                "path": "spec.nodeName"
            },
            {
                "name": "AX_POD_NAME",
                "path": "metadata.name"
            },
            {
                "name": "AX_POD_NAMESPACE",
                "path": "metadata.namespace"
            },
            {
                "name": "AX_POD_IP",
                "path": "status.podIP"
            },
            # Values
            {
                "name": "DISK_MULT",
                "value": str(self.disk_mult)
            },
            {
                "name": "AX_TARGET_CLOUD",
                "value": Cloud().target_cloud()
            },
            {
                "name": "AX_CLUSTER_NAME_ID",
                "value": self._cluster_name_id
            },
            {
                "name": "AX_CUSTOMER_ID",
                "value": AXCustomerId().get_customer_id()
            },
        ]

        # Special cases for daemons
        if is_daemon:
            if resource_updated:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": str(self.daemon_cpu_mult)
                    },
                    {
                        "name": "MEM_MULT",
                        "value": str(self.daemon_mem_mult)
                    },
                ]
            else:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": "1.0"
                    },
                    {
                        "name": "MEM_MULT",
                        "value": "1.0"
                    },
                ]
        else:
            default_envs += [
                {
                    "name": "CPU_MULT",
                    "value": str(self.cpu_mult)
                },
                {
                    "name": "MEM_MULT",
                    "value": str(self.mem_mult)
                },
            ]

        rst = []
        for d in default_envs:
            var = V1EnvVar()
            var.name = d["name"]
            if d.get("path", None):
                # Downward-API variable: value comes from a pod field reference.
                field = V1ObjectFieldSelector()
                field.field_path = d["path"]
                src = V1EnvVarSource()
                src.field_ref = field
                var.value_from = src
            else:
                var.value = d["value"]
            rst.append(var)
        return rst