def ensure_master_tags(self):
    """
    During upgrade, we need to ensure master has AXClusterNameID, AXCustomerID, AXTier tags (#23)
    :return: True if we updated master tags
    """
    for tag in self.attributes['master_tags']:
        if tag["Key"] == "AXTier":
            # Master already has the updated tags
            return False

    self.attributes['master_tags'] += [
        {
            "Key": "AXCustomerID",
            "Value": AXCustomerId().get_customer_id()
        },
        {
            "Key": "AXTier",
            "Value": "master"
        },
        {
            "Key": "AXClusterNameID",
            "Value": self.cluster_name_id
        },
    ]
    return True
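# Hypothetical caller-side sketch (the names and boto3 wiring below are assumptions,
# not part of the original code): ensure_master_tags() only mutates the in-memory
# attribute list, so a caller would re-apply the tags to the master instance when
# it returns True.
if master.ensure_master_tags():
    ec2_client.create_tags(
        Resources=[master_instance_id],           # hypothetical master instance id
        Tags=master.attributes['master_tags'])    # now includes the three AX* tags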
def _update_version(self):
    # Software info we get during install / upgrade does not contain the AMI id,
    # so we need to persist it here as well.
    self._software_info.ami_id = self._cluster_config.get_ami_id()
    AXVersion(
        AXCustomerId().get_customer_id(),
        self._cluster_name_id,
        self._aws_profile
    ).update(self._software_info.to_dict())
def _generate_replacing_for_user_provisioned_cluster(self):
    trusted_cidr_str = self._get_trusted_cidr_str()
    self._persist_node_resource_rsvp(0, 0)
    with open("/kubernetes/cluster/version.txt", "r") as f:
        cluster_install_version = f.read().strip()
    return {
        "REGISTRY": self._software_info.registry,
        "REGISTRY_SECRETS": self._software_info.registry_secrets,
        "NAMESPACE": self._software_info.image_namespace,
        "VERSION": self._software_info.image_version,
        "AX_CLUSTER_NAME_ID": self._cluster_name_id,
        "AX_AWS_REGION": self._region,
        "AX_AWS_ACCOUNT": self._account,
        "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
        "TRUSTED_CIDR": trusted_cidr_str,
        "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
        "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
        "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
        "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
        "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
        "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
        "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(key=self._cluster_config_path.cluster_metadata()),
        "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
        "ARGO_DATA_BUCKET_NAME": AXClusterConfigPath(self._cluster_name_id).bucket(),
        "LOAD_BALANCER_TYPE": "LoadBalancer",
        "ARGO_S3_ACCESS_KEY_ID": base64.b64encode(os.getenv("ARGO_S3_ACCESS_KEY_ID", "")),
        "ARGO_S3_ACCESS_KEY_SECRET": base64.b64encode(os.getenv("ARGO_S3_ACCESS_KEY_SECRET", "")),
    }
def cluster_bucket():
    bucket = AXS3Bucket(
        bucket_name="applatix-cluster-{}-0".format(AXCustomerId().get_customer_id()),
        region=TEST_AWS_REGION,
        aws_profile=TEST_AWS_PROFILE
    )
    bucket.create()
    yield bucket
    bucket.delete(force=True)
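# The yield/teardown shape above suggests cluster_bucket is a pytest fixture
# (presumably decorated with @pytest.fixture at its definition site). A minimal,
# hypothetical test consuming it by argument name might look like this; the
# AXS3Bucket methods used below are assumptions, not confirmed API:
def test_cluster_bucket_roundtrip(cluster_bucket):
    cluster_bucket.put_object("unit-test/key", "value")            # assumed AXS3Bucket API
    assert cluster_bucket.get_object("unit-test/key") == "value"   # assumed AXS3Bucket API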
def _generate_wait_container_spec(self):
    main_container_name = self.service.template.name
    c = SidecarTask(main_container_name, self.software_info.image_namespace, self.software_info.image_version)
    c.add_env("AX_MAIN_CONTAINER_NAME", value=main_container_name)
    c.add_env("AX_JOB_NAME", value=self.jobname)
    c.add_env("AX_CUSTOMER_ID", AXCustomerId().get_customer_id())
    c.add_env("AX_REGION", AXClusterConfig().get_region())
    c.add_env("AX_CLUSTER_NAME_ID", self._name_id)
    return c
def axmon_api_get_portal():
    """
    Get portal connection information.
    Returns the portal connection information as a JSON object.
    """
    try:
        portal = {
            "cluster_name_id": os.getenv("AX_CLUSTER_NAME_ID"),
            "customer_id": AXCustomerId().get_customer_id()
        }
        return jsonify(portal)
    except Exception as e:
        raise AXPlatformException("Critical environment variable missing: {}".format(e))
def __init__(self, containername, customer_image, namespace, version):
    s = SoftwareInfo()
    super(ArtifactsContainer, self).__init__(
        containername, "{}/{}/artifacts:{}".format(s.registry, namespace, version))

    # artifacts scratch space
    self._artifacts_scratch = ContainerVolume(
        "artifacts-scratch", ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self._artifacts_scratch.set_type("EMPTYDIR")
    self.add_volume(self._artifacts_scratch)

    # create a hostpath for the docker socket. This is used for running docker inspect
    socket_hostpath = ContainerVolume("docker-socket-file", "/var/run/docker.sock")
    socket_hostpath.set_type("HOSTPATH", "/var/run/docker.sock")
    self.add_volume(socket_hostpath)

    # emptydir for copying static binaries from the init container
    # so that they are available in the main container
    self._static_bins = ContainerVolume("static-bins", "/copyto")
    self._static_bins.set_type("EMPTYDIR")
    self.add_volume(self._static_bins)

    # add environment vars needed for artifacts
    self.add_env("AX_TARGET_CLOUD", value=Cloud().target_cloud())
    self.add_env("AX_CLUSTER_NAME_ID", value=AXClusterId().get_cluster_name_id())
    self.add_env("AX_CUSTOMER_ID", value=AXCustomerId().get_customer_id())
    self.add_env("AX_CUSTOMER_IMAGE_NAME", value=customer_image)
    self.add_env("AX_ARTIFACTS_SCRATCH", value=ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self.add_env("AX_POD_NAME", value_from="metadata.name")
    self.add_env("AX_POD_IP", value_from="status.podIP")
    self.add_env("AX_POD_NAMESPACE", value_from="metadata.namespace")
    self.add_env("AX_NODE_NAME", value_from="spec.nodeName")
    self.add_env("ARGO_LOG_BUCKET_NAME", os.getenv("ARGO_LOG_BUCKET_NAME", ""))
    self.add_env("ARGO_DATA_BUCKET_NAME", os.getenv("ARGO_DATA_BUCKET_NAME", ""))

    annotation_vol = ContainerVolume("annotations", "/etc/axspec")
    annotation_vol.set_type("DOWNWARDAPI", "metadata.annotations")
    self.add_volume(annotation_vol)

    # AA-3175: CPU and memory are set to the lowest possible so that pod requests are kept at a minimum
    self.add_resource_constraints("cpu_cores", 0.001)
    self.add_resource_constraints("mem_mib", 4)
def __init__(self):
    self._cluster_name_id = AXClusterId().get_cluster_name_id()
    self._cluster_name = AXClusterId().get_cluster_name()
    self._cluster_id = AXClusterId().get_cluster_id()
    self._account = AXCustomerId().get_customer_id()
    self._sleep_interval = SECONDS_PER_MINUTE
    self._hourly = SECONDS_PER_HOUR
    self._daily = SECONDS_PER_DAY
    self._last_hourly = -self._hourly
    self._last_daily = -self._daily
    self._elasticsearch_host = "elasticsearch"
    logger.debug("AX account: %s cluster_id: %s", self._account, self._cluster_name_id)
def _generate_wait_container_spec(self):
    main_container_name = self.service.template.name
    c = SidecarTask(main_container_name, self.software_info.image_namespace, self.software_info.image_version)
    c.add_env("AX_MAIN_CONTAINER_NAME", value=main_container_name)
    c.add_env("AX_JOB_NAME", value=self.name)
    c.add_env("AX_CUSTOMER_ID", AXCustomerId().get_customer_id())
    c.add_env("AX_REGION", AXClusterConfig().get_region())
    c.add_env("AX_CLUSTER_NAME_ID", self._name_id)
    c.add_env("ARGO_LOG_BUCKET_NAME", os.environ.get("ARGO_LOG_BUCKET_NAME"))
    c.add_env("ARGO_DATA_BUCKET_NAME", self._s3_bucket)
    self._add_optional_envs(c)
    return c
def __init__(self, name_id):
    # Account is the source of truth for all S3 config related operations, as we
    # use one bucket per customer account and the customer id is the unique
    # identifier of all buckets belonging to that customer.
    self._account = AXCustomerId().get_customer_id()

    # Different configs are stored in different buckets, so set it to None
    # in the base class.
    self._bucket_name = None

    # We use cluster_name/id/ to separate directories for different clusters
    # under the same account.
    self._cluster_name_id = name_id

    # Set a default value, which will be overridden by classes inheriting it.
    # External means these configs are stored in an account different from the
    # one the cluster is running in; for example, we upload support logs to a
    # support AWS account, but upload artifacts to the customer account.
    self._external = False

    # Bucket existence is a sign that the corresponding S3 path is valid (i.e. this
    # class can be used). For support, this points to the support bucket; for cluster
    # artifacts, this points to the data bucket, etc.
    self._bucket_exists = None

    # We have different naming schemes for GCP and AWS, so we parse the cluster
    # name_id in a different way. This series of classes enforces that a name_id
    # be passed as "<cluster_name>-<cluster_id>".
    if Cloud().target_cloud_gcp():
        self._cluster_name, self._cluster_id = AXClusterNameIdParser.parse_cluster_name_id_gcp(name_id)
    elif Cloud().target_cloud_aws():
        self._cluster_name, self._cluster_id = AXClusterNameIdParser.parse_cluster_name_id_aws(name_id)
    else:
        assert False, "Invalid cloud provider: {}. Only aws and gcp are supported".format(Cloud().target_cloud())

    assert self._cluster_name and self._cluster_id, "Failed to extract cluster name and id from [{}]".format(name_id)
def _upgrade_kube(self):
    """
    This function calls our script to upgrade Kubernetes and cluster nodes
    :return:
    """
    env = {
        "CLUSTER_NAME_ID": self._name_id,
        "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
        "OLD_KUBE_VERSION": self._current_software_info.kube_version,
        "NEW_KUBE_VERSION": self._cfg.target_software_info.kube_version,
        "NEW_CLUSTER_INSTALL_VERSION": self._cfg.target_software_info.kube_installer_version,
        "ARGO_AWS_REGION": self._cluster_config.get_region()
    }
    if self._cfg.cloud_profile:
        env["ARGO_AWS_PROFILE"] = self._cfg.cloud_profile
    else:
        env["ARGO_AWS_PROFILE"] = AWS_DEFAULT_PROFILE

    logger.info("Upgrading Kubernetes with environments %s", pformat(env))
    env.update(os.environ)
    subprocess.check_call(["upgrade-kubernetes"], env=env)
def _generate_replacing(self):
    # Platform code is running in Python 2.7, and therefore for the trusted cidr list, the str()
    # method will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'],
    # and this 'u' prefix cannot be suppressed. With this prefix, our macro replacing would create
    # invalid yaml files, and therefore we construct the string manually here.
    trusted_cidr = self._cluster_config.get_trusted_cidr()
    if isinstance(trusted_cidr, list):
        trusted_cidr_str = "["
        for cidr in trusted_cidr:
            trusted_cidr_str += "\"{}\",".format(str(cidr))
        trusted_cidr_str = trusted_cidr_str[:-1]
        trusted_cidr_str += "]"
    else:
        trusted_cidr_str = "[{}]".format(trusted_cidr)

    axsys_cpu = 0
    axsys_mem = 0
    daemon_cpu = 0
    daemon_mem = 0
    for name in self._kube_objects.keys():
        cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage
        axsys_cpu += cpu
        axsys_mem += mem
        daemon_cpu += dcpu
        daemon_mem += dmem

    # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
    # have a memory request, but this is an approximation)
    daemon_cpu += 100
    daemon_mem += 100

    logger.info(
        "Resource usages: axsys_cpu: %s millicores, axsys_mem: %s Mi, node_daemon_cpu: %s millicores, node_daemon_mem: %s Mi",
        axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

    axsys_node_count = int(self._cluster_config.get_asxys_node_count())
    axuser_min_count = str(int(self._cluster_config.get_min_node_count()) - axsys_node_count)
    axuser_max_count = str(int(self._cluster_config.get_max_node_count()) - axsys_node_count)
    autoscaler_scan_interval = str(self._cluster_config.get_autoscaler_scan_interval())

    usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[self._cluster_config.get_axuser_node_type()]["cpu"]
    usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[self._cluster_config.get_axuser_node_type()]["memory"]
    scale_down_util_thresh = round(max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
    logger.info("Setting node scale down utilization threshold to %s", scale_down_util_thresh)

    self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

    with open("/kubernetes/cluster/version.txt", "r") as f:
        cluster_install_version = f.read().strip()

    # Prepare autoscaler
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)
    asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg() or asg_manager.get_on_demand_asg()
    if not asg:
        raise AXPlatformException(
            "Failed to get autoscaling group for cluster {}".format(self._cluster_name_id))
    asg_name = asg["AutoScalingGroupName"]

    if not asg_name:
        logger.error("Autoscaling group name not found for %s", self._cluster_name_id)
        raise AXPlatformException("Cannot find cluster autoscaling group")

    # Prepare minion-manager
    spot_instances_option = self._cluster_config.get_spot_instances_option()
    minion_manager_asgs = ""
    if spot_instances_option == SpotInstanceOption.ALL_SPOT:
        for asg in asg_manager.get_all_asgs():
            minion_manager_asgs = minion_manager_asgs + asg["AutoScalingGroupName"] + " "
        minion_manager_asgs = minion_manager_asgs[:-1]
    elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
        minion_manager_asgs = asg_manager.get_variable_asg()["AutoScalingGroupName"]

    return {
        "REGISTRY": self._software_info.registry,
        "REGISTRY_SECRETS": self._software_info.registry_secrets,
        "NAMESPACE": self._software_info.image_namespace,
        "VERSION": self._software_info.image_version,
        "AX_CLUSTER_NAME_ID": self._cluster_name_id,
        "AX_AWS_REGION": self._region,
        "AX_AWS_ACCOUNT": self._account,
        "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
        "TRUSTED_CIDR": trusted_cidr_str,
        "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
        "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
        "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
        "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
        "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
        "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
        "ASG_MIN": axuser_min_count,
        "ASG_MAX": axuser_max_count,
        "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval,
        "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh),
        "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(key=self._cluster_config_path.cluster_metadata()),
        "ASG_NAME": asg_name,
        "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
        "AX_ENABLE_SPOT_INSTANCES": str(spot_instances_option != SpotInstanceOption.NO_SPOT),
        "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs,
    }
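# A minimal sketch (not the platform code) of the same trusted-CIDR formatting done
# with join(), which avoids the trailing-comma trim and sidesteps the Python 2 u''
# prefix issue by coercing each entry with str():
def format_trusted_cidr(trusted_cidr):
    if isinstance(trusted_cidr, list):
        return "[" + ",".join('"{}"'.format(str(cidr)) for cidr in trusted_cidr) + "]"
    return "[{}]".format(trusted_cidr)

# format_trusted_cidr([u'54.149.149.230/32', u'73.70.250.25/32'])
#   -> '["54.149.149.230/32","73.70.250.25/32"]'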
def prepare_kube_install_config(name_id, aws_profile, cluster_info, cluster_config):
    """
    This function generates kube-up envs. It also adds those envs to the cluster config.
    :param name_id:
    :param aws_profile:
    :param cluster_info: AXClusterInfo object
    :param cluster_config: AXClusterConfig object
    :return:
    """
    logger.info("Preparing env for kube-up ...")
    validate_cluster_node_counts(cluster_config)

    master_config_env = cluster_config.get_master_config_env()

    # Need to pass in without env.
    customer_id = AXCustomerId().get_customer_id()
    kube_version = os.getenv("AX_KUBE_VERSION")

    env = {
        # AWS environments
        "AWS_IMAGE": cluster_config.get_ami_id(),
        "KUBERNETES_PROVIDER": cluster_config.get_provider(),
        "KUBE_AWS_ZONE": cluster_config.get_zone(),
        "KUBE_AWS_INSTANCE_PREFIX": name_id,
        "AWS_S3_BUCKET": AXClusterConfigPath(name_id).bucket(),
        "AWS_S3_REGION": cluster_config.get_region(),
        "AWS_S3_STAGING_PATH": "kubernetes-staging/v{}".format(kube_version),

        # Node Configs
        "AX_CLUSTER_NUM_NODES_MIN": cluster_config.get_min_node_count(),
        "AXUSER_ON_DEMAND_NUM_NODES": cluster_config.get_axuser_on_demand_count(),
        "AXUSER_NODE_TYPE": cluster_config.get_axuser_node_type(),
        "AXSYS_NUM_NODES": cluster_config.get_asxys_node_count(),
        "AXSYS_NODE_TYPE": cluster_config.get_axsys_node_type(),
        "MASTER_SIZE": cluster_config.get_master_type(),
        "AX_VOL_DISK_SIZE": str(cluster_config.get_ax_vol_size()),

        # Network
        "KUBE_VPC_CIDR_BASE": cluster_config.get_vpc_cidr_base(),

        # Cluster identity
        "AX_CUSTOMER_ID": customer_id,
        "AX_CLUSTER_NAME_ID": name_id,
        "AWS_SSH_KEY": cluster_info.get_key_file_path(),
        "KUBECONFIG": cluster_info.get_kube_config_file_path(),
        "KUBECTL_PATH": "/opt/google-cloud-sdk/bin/kubectl",
    }

    if aws_profile:
        env["AWS_DEFAULT_PROFILE"] = aws_profile

    optional_env = {
        # Start off directly with all spot instances only for dev clusters.
        "AX_USE_SPOT_INSTANCES": cluster_config.get_spot_instances_option() != "none",
        "NODE_SPOT_PRICE": cluster_config.get_node_spot_price(),
        "NODE_SPOT_OPTION": cluster_config.get_spot_instances_option(),
        "SUBNET_SIZE": cluster_config.get_subnet_size(),
        "VPC_ID": cluster_config.get_vpc_id() if cluster_config.get_vpc_id() else "",
    }

    env.update(default_kube_up_env)

    # For optional env, set it only if cluster_config has it set.
    for e in optional_env:
        val = optional_env[e]
        if val is not None:
            if isinstance(val, bool):
                env.update({e: str(val).lower()})
            else:
                env.update({e: str(val)})

    env.update(master_config_env)
    cluster_config.set_kube_installer_config(config=env)
    logger.info("Preparing env for kube-up ... DONE")
    return env
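# Illustration of the optional-env normalization in prepare_kube_install_config
# (the values below are hypothetical): None entries are skipped, booleans become
# lowercase strings, and everything else is passed through str().
optional_env = {"AX_USE_SPOT_INSTANCES": True, "SUBNET_SIZE": 24, "NODE_SPOT_PRICE": None}
env = {}
for key, val in optional_env.items():
    if val is not None:
        env[key] = str(val).lower() if isinstance(val, bool) else str(val)
# env == {"AX_USE_SPOT_INSTANCES": "true", "SUBNET_SIZE": "24"}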
def _generate_default_envs(self, is_daemon, resource_updated):
    """
    Add essential variables to all system containers
    :param is_daemon:
    :return:
    """
    default_envs = [
        # Kubernetes downward APIs
        {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
        {"name": "AX_POD_NAME", "path": "metadata.name"},
        {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
        {"name": "AX_POD_IP", "path": "status.podIP"},

        # Values
        {"name": "DISK_MULT", "value": str(self.disk_mult)},
        {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
        {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
        {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
    ]

    # Special cases for daemons
    if is_daemon:
        if resource_updated:
            default_envs += [
                {"name": "CPU_MULT", "value": str(self.daemon_cpu_mult)},
                {"name": "MEM_MULT", "value": str(self.daemon_mem_mult)},
            ]
        else:
            default_envs += [
                {"name": "CPU_MULT", "value": "1.0"},
                {"name": "MEM_MULT", "value": "1.0"},
            ]
    else:
        default_envs += [
            {"name": "CPU_MULT", "value": str(self.cpu_mult)},
            {"name": "MEM_MULT", "value": str(self.mem_mult)},
        ]

    rst = []
    for d in default_envs:
        var = V1EnvVar()
        var.name = d["name"]
        if d.get("path", None):
            field = V1ObjectFieldSelector()
            field.field_path = d["path"]
            src = V1EnvVarSource()
            src.field_ref = field
            var.value_from = src
        else:
            var.value = d["value"]
        rst.append(var)
    return rst
def _generate_default_envs(self, is_daemon, resource_updated):
    """
    Add essential variables to all system containers
    :param is_daemon:
    :return:
    """
    default_envs = [
        # Kubernetes downward APIs
        {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
        {"name": "AX_POD_NAME", "path": "metadata.name"},
        {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
        {"name": "AX_POD_IP", "path": "status.podIP"},

        # Values
        {"name": "DISK_MULT", "value": str(self.disk_mult)},
        {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
        {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
        {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
    ]

    aws_region = os.environ.get("AX_AWS_REGION", "")
    if aws_region != "":
        default_envs.append({"name": "AX_AWS_REGION", "value": aws_region})

    if os.getenv("ARGO_S3_ACCESS_KEY_ID", "") != "":
        # Secrets
        default_envs.append({"name": "ARGO_S3_ACCESS_KEY_ID", "secret": "argo-access-key"})
        default_envs.append({"name": "ARGO_S3_ACCESS_KEY_SECRET", "secret": "argo-secret-key"})
        default_envs.append({"name": "ARGO_S3_ENDPOINT", "value": os.getenv("ARGO_S3_ENDPOINT", None)})

    # Special cases for daemons
    if is_daemon:
        if resource_updated:
            default_envs += [
                {"name": "CPU_MULT", "value": str(self.daemon_cpu_mult)},
                {"name": "MEM_MULT", "value": str(self.daemon_mem_mult)},
            ]
        else:
            default_envs += [
                {"name": "CPU_MULT", "value": "1.0"},
                {"name": "MEM_MULT", "value": "1.0"},
            ]
    else:
        default_envs += [
            {"name": "CPU_MULT", "value": str(self.cpu_mult)},
            {"name": "MEM_MULT", "value": str(self.mem_mult)},
        ]

    rst = []
    for d in default_envs:
        var = V1EnvVar()
        var.name = d["name"]
        if d.get("path", None):
            field = V1ObjectFieldSelector()
            field.field_path = d["path"]
            src = V1EnvVarSource()
            src.field_ref = field
            var.value_from = src
        elif d.get("secret", None):
            secret = V1SecretKeySelector()
            secret.key = d["secret"]
            secret.name = d["secret"]
            src = V1EnvVarSource()
            src.secret_key_ref = secret
            var.value_from = src
        else:
            var.value = d["value"]
        rst.append(var)
    return rst
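# A minimal, self-contained sketch of the secret-backed env var the loop above builds,
# assuming the standard kubernetes Python client models (which the V1EnvVar /
# V1EnvVarSource / V1SecretKeySelector names suggest); note the loop sets the secret
# name and key to the same string, e.g. "argo-access-key":
from kubernetes.client import V1EnvVar, V1EnvVarSource, V1SecretKeySelector

secret_env = V1EnvVar(
    name="ARGO_S3_ACCESS_KEY_ID",
    value_from=V1EnvVarSource(
        secret_key_ref=V1SecretKeySelector(name="argo-access-key", key="argo-access-key")))
# Serialized into the pod spec this becomes:
#   - name: ARGO_S3_ACCESS_KEY_ID
#     valueFrom:
#       secretKeyRef: {name: argo-access-key, key: argo-access-key}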