def __init__(self, cfg):
    """
    Set up everything needed to resume a cluster.

    :param cfg: a ClusterRestartConfig instance (enforced by assertion);
                supplies cluster_name, cluster_id, cloud_profile and dry_run
    """
    assert isinstance(cfg, ClusterRestartConfig)
    self._cfg = cfg
    super(ClusterResumer, self).__init__(
        cluster_name=self._cfg.cluster_name,
        cluster_id=self._cfg.cluster_id,
        cloud_profile=self._cfg.cloud_profile,
        dry_run=self._cfg.dry_run)

    # This will raise exception if name/id mapping cannot be found
    self._name_id = self._idobj.get_cluster_name_id()
    self._cluster_info = AXClusterInfo(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile)
    self._cluster_config = AXClusterConfig(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile)
    self._master_manager = AXMasterManager(
        cluster_name_id=self._name_id,
        region=self._cluster_config.get_region(),
        profile=self._cfg.cloud_profile)
    self._bootstrap_obj = AXBootstrap(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region())

    # Initialize node count to 1 as master is not in an auto scaling group
    self._total_nodes = 1
    self._cidr = str(get_public_ip()) + "/32"

    # The software info is downloaded data, so parse it with the safe loader:
    # yaml.load() without an explicit Loader can construct arbitrary Python
    # objects and is rejected outright by recent PyYAML releases.
    self._software_info = SoftwareInfo(info_dict=yaml.safe_load(
        self._cluster_info.download_cluster_software_info()))
def __init__(self, cfg):
    """
    Copy the install-specific options from the parsed configuration.

    :param cfg: object exposing the install options as attributes; it is
                also forwarded unchanged to the parent constructor
    """
    super(ClusterInstallConfig, self).__init__(cfg)

    # Cluster shape
    self.cluster_size = cfg.cluster_size
    self.cluster_type = cfg.cluster_type

    # Cloud placement and networking
    self.cloud_region = cfg.cloud_region
    self.cloud_placement = cfg.cloud_placement
    self.vpc_id = cfg.vpc_id
    self.vpc_cidr_base = cfg.vpc_cidr_base
    self.subnet_mask_size = cfg.subnet_mask_size
    self.trusted_cidrs = cfg.trusted_cidrs

    # Node options
    self.user_on_demand_nodes = cfg.user_on_demand_nodes
    self.spot_instances_option = cfg.spot_instances_option
    self.enable_sandbox = cfg.enable_sandbox

    # Platform options (note the attribute names differ from the cfg names)
    self.manifest_root = cfg.service_manifest_root
    self.bootstrap_config = cfg.platform_bootstrap_config
    self.autoscaling_interval = cfg.cluster_autoscaling_scan_interval
    self.support_object_store_name = cfg.support_object_store_name

    # Read software info from the config file when one is given,
    # otherwise from envs
    version_file = cfg.software_version_info
    self.software_info = (
        SoftwareInfo(info_file=version_file) if version_file else SoftwareInfo()
    )
def __init__(self, config_file_path):
    """
    Load the given kubernetes yaml file and prepare its updated components.

    :param config_file_path: path to the yaml file; must be an existing file
    """
    assert os.path.isfile(config_file_path), \
        "Config file {} is not a file".format(config_file_path)

    self._config_file = config_file_path
    self._cluster_name_id = AXClusterId().get_cluster_name_id()
    self._cluster_config = AXClusterConfig(
        cluster_name_id=self._cluster_name_id)

    # User clusters keep every multiplier at 1; all other providers get the
    # values from _get_resource_multipliers().
    if self._cluster_config.get_cluster_provider().is_user_cluster():
        multipliers = (1, 1, 1, 1, 1)
    else:
        multipliers = self._get_resource_multipliers()
    (self.cpu_mult,
     self.mem_mult,
     self.disk_mult,
     self.daemon_cpu_mult,
     self.daemon_mem_mult) = multipliers

    self._swagger_components = []
    self._yaml_components = []
    self._updated_raw = ""

    # TODO: when we support config software info using a config file, need to figure out how that
    # file gets passed through, since SoftwareInfo is not a singleton
    self._software_info = SoftwareInfo()

    self._load_objects()
    self._load_raw()
def __init__(self, name, image, pull_policy=None):
    """
    Construct a container that will provide the spec for a kubernetes container
    http://kubernetes.io/docs/api-reference/v1/definitions/#_v1_container

    Args:
        name: container name; must conform to kubernetes container naming
        image: image the container runs
        pull_policy: kubernetes image pull policy; None means the kubernetes
                     default is used
    """
    # Identity
    self.name = name
    self.image = image
    self.image_pull_policy = pull_policy

    # Everything below starts out unset / empty
    self.command = None
    self.args = None
    self.vmap = {}
    self.env_map = {}
    self.ports = []
    self.resources = None
    self.privileged = None

    self.software_info = SoftwareInfo()
    self.probes = {}
def __init__(self, name, client=None):
    """
    Prepare the registry secret and application monitor specs for an
    application.

    Args:
        name: application name; also used as the namespace name
        client: optional kubernetes API client. When None, a proxied
                KubernetesApiClient is created.

    Raises:
        AXPlatformException: if no axops load balancer address is available
        AssertionError: if a required spec or AX_CLUSTER_NAME_ID is missing
    """
    self.name = name
    self._client = KubernetesApiClient(use_proxy=True) if client is None else client

    self._registry_spec = None
    self._software_info = SoftwareInfo()
    if self._software_info.registry_is_private():
        secret = KubeObjectConfigFile(
            DEFAULT_SECRET_YAML_PATH,
            {"REGISTRY_SECRETS": self._software_info.registry_secrets})
        for obj in secret.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Secret):
                self._registry_spec = obj
        assert self._registry_spec, "Argo registry specification is missing"

    self._am_service_spec = None
    self._am_deployment_spec = None

    # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
    elb = InternalRoute("axops", "axsys", client=self._client)
    # Indexing [0] directly raised IndexError on an empty list, which made
    # the check below unreachable in exactly the case it is meant to report.
    load_balancers = elb.status(with_loadbalancer_info=True)["loadbalancer"]
    elb_status = load_balancers[0] if load_balancers else None
    if not elb_status:
        raise AXPlatformException(
            "Could not get axops elb address {}".format(elb_status))

    replacements = {
        "NAMESPACE": self._software_info.image_namespace,
        "VERSION": self._software_info.image_version,
        "REGISTRY": self._software_info.registry,
        "APPLICATION_NAME": self.name,
        "AXOPS_EXT_DNS": elb_status,
    }

    cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
    assert cluster_name_id, "Cluster name id is None!"
    cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
    if not cluster_config.get_cluster_provider().is_user_cluster():
        axam_path = DEFAULT_AM_YAML_PATH
    else:
        # User clusters use a different template that also needs the
        # data bucket name.
        axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
        replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv("ARGO_DATA_BUCKET_NAME")

    logger.info("Using replacements: %s", replacements)

    k = KubeObjectConfigFile(axam_path, replacements)
    for obj in k.get_swagger_objects():
        if isinstance(obj, swagger_client.V1Service):
            self._am_service_spec = obj
        elif isinstance(obj, swagger_client.V1beta1Deployment):
            self._am_deployment_spec = obj
            # Tag the monitor pods with their deployment name and cost id
            self._add_pod_metadata(
                "deployment",
                self._am_deployment_spec.metadata.name,
                is_label=True)
            self._add_pod_metadata("ax_costid", json.dumps({
                "app": self.name,
                "service": "axam-deployment",
                "user": "******"
            }))
        else:
            logger.debug("Ignoring specification of type {}".format(type(obj)))

    assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"
def __init__(self, cfg):
    """
    Copy the upgrade-specific options from the parsed configuration.

    :param cfg: object exposing the upgrade options as attributes; it is
                also forwarded unchanged to the parent constructor
    """
    super(ClusterUpgradeConfig, self).__init__(cfg)

    self.manifest_root = cfg.service_manifest_root
    self.bootstrap_config = cfg.platform_bootstrap_config
    self.force_upgrade = cfg.force_upgrade

    # Read software info from the config file when one is given,
    # otherwise from envs
    version_file = cfg.software_version_info
    self.target_software_info = (
        SoftwareInfo(info_file=version_file) if version_file else SoftwareInfo()
    )
def __init__(self, cfg):
    """
    Set up everything needed to upgrade a cluster.

    :param cfg: a ClusterUpgradeConfig instance (enforced by assertion);
                supplies cluster_name, cluster_id and cloud_profile
    """
    assert isinstance(cfg, ClusterUpgradeConfig)
    self._cfg = cfg
    super(ClusterUpgrader, self).__init__(
        cluster_name=self._cfg.cluster_name,
        cluster_id=self._cfg.cluster_id,
        cloud_profile=self._cfg.cloud_profile
    )

    # This will raise exception if name/id mapping cannot be found
    self._name_id = self._idobj.get_cluster_name_id()
    self._cluster_info = AXClusterInfo(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile
    )
    self._cluster_config = AXClusterConfig(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile
    )
    self._bootstrap_obj = AXBootstrap(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region()
    )

    # The software info is downloaded data, so parse it with the safe loader:
    # yaml.load() without an explicit Loader can construct arbitrary Python
    # objects and is rejected outright by recent PyYAML releases.
    self._current_software_info = SoftwareInfo(
        info_dict=yaml.safe_load(
            self._cluster_info.download_cluster_software_info()
        )
    )

    self._cidr = str(get_public_ip()) + "/32"
def __init__(self, containername, customer_image, namespace, version):
    """
    Build the artifacts container spec: image, volumes, environment and
    resource constraints.

    Args:
        containername: name given to the container
        customer_image: value exported as AX_CUSTOMER_IMAGE_NAME
        namespace: image namespace used in the artifacts image path
        version: image tag used in the artifacts image path
    """
    info = SoftwareInfo()
    artifacts_image = "{}/{}/artifacts:{}".format(info.registry, namespace, version)
    super(ArtifactsContainer, self).__init__(containername, artifacts_image)

    # artifacts scratch space
    self._artifacts_scratch = ContainerVolume(
        "artifacts-scratch", ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self._artifacts_scratch.set_type("EMPTYDIR")
    self.add_volume(self._artifacts_scratch)

    # hostpath for the docker socket; used for running docker inspect
    docker_sock = ContainerVolume("docker-socket-file", "/var/run/docker.sock")
    docker_sock.set_type("HOSTPATH", "/var/run/docker.sock")
    self.add_volume(docker_sock)

    # emptydir shared with the init container so the static binaries it
    # copies are available in the main container
    self._static_bins = ContainerVolume("static-bins", "/copyto")
    self._static_bins.set_type("EMPTYDIR")
    self.add_volume(self._static_bins)

    # environment vars needed for artifacts: plain values first ...
    self.add_env("AX_TARGET_CLOUD", value=Cloud().target_cloud())
    self.add_env("AX_CLUSTER_NAME_ID", value=AXClusterId().get_cluster_name_id())
    self.add_env("AX_CUSTOMER_ID", value=AXCustomerId().get_customer_id())
    self.add_env("AX_CUSTOMER_IMAGE_NAME", value=customer_image)
    self.add_env("AX_ARTIFACTS_SCRATCH",
                 value=ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)

    # ... then the ones resolved from pod fields ...
    for env_name, field_path in (
            ("AX_POD_NAME", "metadata.name"),
            ("AX_POD_IP", "status.podIP"),
            ("AX_POD_NAMESPACE", "metadata.namespace"),
            ("AX_NODE_NAME", "spec.nodeName")):
        self.add_env(env_name, value_from=field_path)

    # ... and the bucket names forwarded from this process' environment
    for bucket_var in ("ARGO_LOG_BUCKET_NAME", "ARGO_DATA_BUCKET_NAME"):
        self.add_env(bucket_var, os.getenv(bucket_var, ""))

    pod_annotations = ContainerVolume("annotations", "/etc/axspec")
    pod_annotations.set_type("DOWNWARDAPI", "metadata.annotations")
    self.add_volume(pod_annotations)

    # AA-3175: CPU and memory are set to lowest possible so that pod requests are kept at a minimum
    self.add_resource_constraints("cpu_cores", 0.001)
    self.add_resource_constraints("mem_mib", 4)
def __init__(self):
    """
    Initialize the monitor: version, shutdown flag, cluster condition
    variable, kubernetes client and software info.
    """
    super(AXMon, self).__init__()

    self.version = __version__
    self._shutdown = False
    self._cluster_cond = threading.Condition()

    self._kubectl = KubernetesApiClient(use_proxy=True)

    # Build the SoftwareInfo kept by this monitor
    self._software_info = SoftwareInfo()

    # init the volume manager singleton (only when targeting AWS)
    if Cloud().target_cloud_aws():
        VolumeManager()
def __init__(self, name, namespace="axuser"):
    """
    Args:
        name: name of this object
        namespace: kubernetes namespace it lives in (default "axuser")
    """
    self.name = name
    self.namespace = namespace
    self.client = KubernetesApiClient(use_proxy=True)

    # this is the argo.services.service.Service object
    self.service = None
    self._host_vols = []

    cluster_name_id = AXClusterId().get_cluster_name_id()
    self._name_id = cluster_name_id

    # Log bucket location
    self._s3_bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
    self._s3_bucket_ax = AXLogPath(cluster_name_id).bucket()
    self._s3_key_prefix_ax = AXLogPath(cluster_name_id).artifact()

    # Cluster data bucket location
    self._s3_bucket = AXClusterDataPath(cluster_name_id).bucket()
    self._s3_key_prefix = AXClusterDataPath(cluster_name_id).artifact()

    self.software_info = SoftwareInfo()
    self._resources = AXResources()
def __init__(self):
    """
    Set up the kubernetes clients, bucket locations and bookkeeping fields.
    No job name or service is attached yet.
    """
    self.client = KubernetesApiClient(use_proxy=True)
    self.batchapi = self.client.batchv
    self.kube_namespace = "axuser"
    self.jobname = None

    # this is the argo.services.service.Service object
    self.service = None
    self._host_vols = []

    cluster_name_id = AXClusterId().get_cluster_name_id()
    self._name_id = cluster_name_id

    # Log bucket location
    self._s3_bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
    self._s3_bucket_ax = AXLogPath(cluster_name_id).bucket()
    self._s3_key_prefix_ax = AXLogPath(cluster_name_id).artifact()

    # Cluster data bucket location
    self._s3_bucket = AXClusterDataPath(cluster_name_id).bucket()
    self._s3_key_prefix = AXClusterDataPath(cluster_name_id).artifact()

    self._attribute_map = {"uuid": "metadata.uid"}
    self.software_info = SoftwareInfo()
    self._ax_resources = {}
def __init__(self, name, application):
    """
    Each deployment has a name and needs to be part of an application.
    Application maps to a kubernetes namespace and the deployment will
    be created in this namespace.

    Args:
        name: deployment name
        application: the application that this deployment runs under
    """
    self.name = name
    self.application = application
    self.spec = None  # no spec is attached at construction time

    self.client = KubernetesApiClient(use_proxy=True)
    self._nameid = AXClusterId().get_cluster_name_id()
    self._software_info = SoftwareInfo()
    self._app_obj = Application(application)
def __init__(
        self,
        cluster_name_id=None,
        aws_profile=None,
        debug=True,
        manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
        config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
        software_info=None):
    """
    AX Platform bootstrap

    :param cluster_name_id: cluster name id
    :param aws_profile: aws profile to authenticate all aws clients
    :param debug: debug mode
    :param manifest_root: root directory to all ax service objects
    :param config_file: platform configuration file handed to AXPlatformConfig
    :param software_info: optional SoftwareInfo; a default one is created
                          when this is not given
    """
    self._software_info = software_info or SoftwareInfo()
    assert isinstance(self._software_info, SoftwareInfo), \
        "Wrong type ({}) of software info passed in.".format(self._software_info)

    self._aws_profile = aws_profile
    self._manifest_root = manifest_root
    self._config = AXPlatformConfig(config_file)

    logger.info("Using Kubernetes manifest from %s", self._manifest_root)
    logger.info("Using platform configuration \"%s\" from %s",
                self._config.name, config_file)

    # Cluster identity and the objects derived from it
    self._cluster_name_id = AXClusterId(cluster_name_id).get_cluster_name_id()
    self._cluster_config = AXClusterConfig(
        cluster_name_id=self._cluster_name_id,
        aws_profile=self._aws_profile)
    self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
    self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                       aws_profile=self._aws_profile)
    self._region = self._cluster_config.get_region()

    # The account id is only looked up on AWS
    self._account = (
        AWSAccountInfo(aws_profile=self._aws_profile).get_account_id()
        if Cloud().target_cloud_aws() else ""
    )

    self._bucket_name = self._cluster_config_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name,
                                      aws_profile=self._aws_profile,
                                      region=self._region)

    # In debug mode, when we failed to create an object, we don't delete it but just
    # leave it for debug.
    self._debug = debug

    # DNS
    self.cluster_dns_name = None

    # Get kube cluster config. Automatic if in pod already.
    if self._cluster_name_id:
        kube_config = self._cluster_info.get_kube_config_file_path()
        if not os.path.isfile(kube_config):
            logger.info("Can't find config file at %s; downloading from s3",
                        kube_config)
            kube_config = self._cluster_info.download_kube_config()
        assert os.path.isfile(kube_config), "No kube_config file available"
    else:
        kube_config = None
    self._kube_config = kube_config

    # Kubernetes related objects and macros
    self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
    self.kube_axsys_namespace = AXNameSpaces.AXSYS
    self.kube_user_namespace = AXNameSpaces.AXUSER
    self.kubectl = KubernetesApiClient(config_file=self._kube_config)
    self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

    # Start monitoring the axsys namespace
    self._monitor = AXKubeMonitor(kubectl=self.kubectl)
    self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
    self._monitor.start()

    # Kube Objects
    self._kube_objects = {}
    self._replacing = {}
template: A stringified template replacements: a dict of string: string Returns: stringified modified template """ return macro_replace(template, replacements) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Managed ELB creator') parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__)) parser.add_argument('--templates', help='Path to templates', default=".") args = parser.parse_args() dir_name = args.templates software_info = SoftwareInfo() replacements = { "REGISTRY": software_info.registry, "NAMESPACE": software_info.image_namespace, "VERSION": software_info.image_version } print("Macro replacements are {}".format(replacements)) for path, template in load_templates_from_dir(dir_name): print("Processing template {}".format(path)) mod_template = modify_templates(template, replacements) post_template(path, mod_template)
class AXSYSKubeYamlUpdater(object):
    """
    This class loads a kubernetes yaml file, updates resource, and generate
    objects that kube_object.py can consume
    """

    def __init__(self, config_file_path):
        """
        :param config_file_path: path to the kubernetes yaml file; must be
                                 an existing file
        """
        assert os.path.isfile(config_file_path), \
            "Config file {} is not a file".format(config_file_path)
        self._config_file = config_file_path
        self._cluster_name_id = AXClusterId().get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id)
        self.cpu_mult, self.mem_mult, self.disk_mult, \
            self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers()
        self._swagger_components = []
        self._yaml_components = []
        self._updated_raw = ""

        # TODO: when we support config software info using a config file, need to figure out how that
        # file gets passed through, since SoftwareInfo is not a singleton
        self._software_info = SoftwareInfo()

        self._load_objects()
        self._load_raw()

    @property
    def updated_raw(self):
        """The updated components dumped back into a yaml string."""
        return self._updated_raw

    @property
    def components_in_dict(self):
        """The updated components as sanitized (serializable) objects."""
        return self._yaml_components

    @property
    def components_in_swagger(self):
        """The updated components as swagger objects."""
        return self._swagger_components

    def _load_objects(self):
        """
        Parse every yaml document in the config file, patch it, and store
        both the swagger object and its serializable form.
        """
        with open(self._config_file, "r") as f:
            data = f.read()
        # safe_load_all: yaml.load_all() without an explicit Loader can build
        # arbitrary Python objects and is rejected by recent PyYAML releases.
        for doc in yaml.safe_load_all(data):
            if doc is None:
                # An empty document (e.g. a trailing "---") has no "kind"
                continue
            swagger_obj = self._config_yaml(doc)
            yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj)
            self._swagger_components.append(swagger_obj)
            self._yaml_components.append(yaml_obj)

    def _load_raw(self):
        """Dump the serializable components into one yaml string."""
        self._updated_raw = yaml.dump_all(self._yaml_components)

    def _get_resource_multipliers(self):
        """
        Resources in yaml templates need to be multiplied with these numbers

        :return: cpu_multiplier, mem_multiplier, disk_multiplier,
                 daemon_cpu_multiplier, daemon_mem_multiplier; all 1 when
                 the cluster config cannot be read
        """
        # Getting cluster size from cluster config, in order to configure resources
        # There are 3 situations we will be using AXClusterConfig
        #   - During install, since the class is a singleton, it has all the values we need
        #     no need to download from s3
        #   - During upgrade, since we are exporting AWS_DEFAULT_PROFILE, we can download
        #     cluster config files from s3 to get the values
        #   - During job creation: the node axmon runs has the proper roles to access s3
        try:
            # NOTE(review): the "asxys" spelling is kept exactly as the
            # original call; confirm it matches the AXClusterConfig method.
            ax_node_max = int(self._cluster_config.get_asxys_node_count())
            ax_node_type = self._cluster_config.get_axsys_node_type()
            usr_node_max = int(
                self._cluster_config.get_max_node_count()) - ax_node_max
            usr_node_type = self._cluster_config.get_axuser_node_type()
            assert all(
                [ax_node_max, ax_node_type, usr_node_max, usr_node_type])
        except Exception as e:
            # Deliberate best effort: fall back to neutral multipliers
            logger.error(
                "Unable to read cluster config, skip resource config for %s. Error %s",
                self._config_file, e)
            return 1, 1, 1, 1, 1

        rc = AXSYSResourceConfig(
            ax_node_type=ax_node_type,
            ax_node_max=ax_node_max,
            usr_node_type=usr_node_type,
            usr_node_max=usr_node_max,
            cluster_type=self._cluster_config.get_ax_cluster_type())
        return (rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier,
                rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier)

    def _config_yaml(self, kube_yaml_obj):
        """
        Load dict into swagger object, patch resource, sanitize, return a dict

        :param kube_yaml_obj: one parsed yaml document; must have a "kind"
                              known to KubeKindToV1KubeSwaggerObject
        :return: swagger object with resource values finalized
        """
        kube_kind = kube_yaml_obj["kind"]
        (swagger_class_literal,
         swagger_instance) = KubeKindToV1KubeSwaggerObject[kube_kind]
        swagger_obj = ApiClient()._ApiClient__deserialize(
            kube_yaml_obj, swagger_class_literal)
        assert isinstance(swagger_obj, swagger_instance), \
            "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance)

        if isinstance(swagger_obj, V1beta1Deployment):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None

            # A pod template without nodeSelector deserializes to None
            node_selector = swagger_obj.spec.template.spec.node_selector or {}
            if node_selector.get('ax.tier', 'applatix') == 'master':
                # Skip updating containers on master.
                logger.info(
                    "Skip updating cpu, mem multipliers for pods on master: %s",
                    swagger_obj.metadata.name)
            else:
                for container in swagger_obj.spec.template.spec.containers:
                    self._update_container(container)
            return swagger_obj
        elif isinstance(swagger_obj, V1Pod):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.image_pull_secrets = None
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1DaemonSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            # We are special-casing applet DaemonSet to compromise the fact that
            # we are using different node type for compute-intense nodes
            scale_resources = swagger_obj.metadata.name == "applet"
            for container in swagger_obj.spec.template.spec.containers:
                self._update_container(container=container,
                                       is_daemon=True,
                                       update_resource=scale_resources)
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1StatefulSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            return self._update_statefulset(swagger_obj)
        elif isinstance(swagger_obj, V1PersistentVolumeClaim):
            self._update_volume(swagger_obj)
            return swagger_obj
        else:
            # HACK, as the original hook will be messed up
            if isinstance(swagger_obj, V1Service):
                if swagger_obj.metadata.name == "axops":
                    # Seems swagger client does not support unicode ... SIGH
                    swagger_obj.spec.load_balancer_source_ranges = [
                        str(cidr)
                        for cidr in self._cluster_config.get_trusted_cidr()
                    ]

                # HACK #2: if we don't do this, kubectl will complain about something such as
                #
                # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z)
                #
                # p.target_port is defined as string though, but if its really a string, kubectl
                # is looking for a port name, rather than a number
                # SIGH ...
                for p in swagger_obj.spec.ports or []:
                    try:
                        p.target_port = int(p.target_port)
                    except (ValueError, TypeError):
                        # A named port: leave it as it is
                        pass
            return swagger_obj

    def _update_deployment_or_daemonset(self, kube_obj):
        """Scale every container of a Deployment or DaemonSet; return it."""
        assert isinstance(kube_obj, V1beta1Deployment) or isinstance(
            kube_obj, V1beta1DaemonSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        return kube_obj

    def _update_statefulset(self, kube_obj):
        """Scale containers and volume claim templates of a StatefulSet; return it."""
        assert isinstance(kube_obj, V1beta1StatefulSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        if isinstance(kube_obj.spec.volume_claim_templates, list):
            for vol in kube_obj.spec.volume_claim_templates:
                self._update_volume(vol)
        return kube_obj

    @staticmethod
    def _normalize_probe_port(probe):
        """Turn a numeric http_get port given as a string into an int."""
        if probe and probe.http_get:
            try:
                probe.http_get.port = int(probe.http_get.port)
            except (ValueError, TypeError):
                # A named port: leave it as it is
                pass

    def _update_container(self,
                          container,
                          is_daemon=False,
                          update_resource=True):
        """
        Scale a container's cpu/memory, normalize its probe ports and append
        the default environment variables.

        :param container: V1Container to patch in place
        :param is_daemon: use the daemon multipliers instead of the regular ones
        :param update_resource: when False, cpu/memory values are left untouched
        """
        assert isinstance(container, V1Container)

        # A container without a resources section (or with only limits or
        # only requests) has None here; there is nothing to scale for it.
        resources = container.resources
        if update_resource and resources is not None:
            cpu_mult = self.daemon_cpu_mult if is_daemon else self.cpu_mult
            mem_mult = self.daemon_mem_mult if is_daemon else self.mem_mult

            def _massage_cpu(orig):
                return orig * cpu_mult

            def _massage_mem(orig):
                return orig * mem_mult

            def _scale_cpu(bucket):
                current = bucket.get("cpu")
                if current:
                    rvc = ResourceValueConverter(value=current, target="cpu")
                    rvc.massage(_massage_cpu)
                    bucket["cpu"] = "{}m".format(rvc.convert("m"))

            def _scale_mem(bucket):
                current = bucket.get("memory")
                if current:
                    rvc = ResourceValueConverter(value=current, target="memory")
                    rvc.massage(_massage_mem)
                    bucket["memory"] = "{}Mi".format(int(rvc.convert("Mi")))

            limits = resources.limits or {}
            requests = resources.requests or {}
            _scale_cpu(limits)
            _scale_cpu(requests)
            _scale_mem(limits)
            _scale_mem(requests)

        self._normalize_probe_port(container.liveness_probe)
        self._normalize_probe_port(container.readiness_probe)

        # Add resource multiplier to containers in case we need them
        if not container.env:
            container.env = []
        container.env += self._generate_default_envs(is_daemon,
                                                     update_resource)

    def _update_volume(self, vol):
        """
        Scale the storage request of a volume claim and fix its access mode.

        :param vol: V1PersistentVolumeClaim to patch in place
        """
        assert isinstance(vol, V1PersistentVolumeClaim)
        vol_size = vol.spec.resources.requests["storage"]

        def _massage_disk(orig):
            return orig * self.disk_mult

        if vol_size:
            rvc = ResourceValueConverter(value=vol_size, target="storage")
            rvc.massage(_massage_disk)
            # Since AWS does not support value such as 1.5G, lets round up to its ceil
            vol.spec.resources.requests["storage"] = "{}Gi".format(
                int(ceil(rvc.convert("Gi"))))

        # Manually patch access mode as swagger client mistakenly interprets this as map
        vol.spec.access_modes = ["ReadWriteOnce"]

    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Add essential variables to all system containers

        :param is_daemon: whether the container belongs to a daemon
        :param resource_updated: whether the container's resources were
                                 scaled; daemons that were not scaled report
                                 CPU_MULT / MEM_MULT as "1.0"
        :return: list of V1EnvVar
        """
        default_envs = [
            # Kubernetes downward APIs
            {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
            {"name": "AX_POD_NAME", "path": "metadata.name"},
            {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
            {"name": "AX_POD_IP", "path": "status.podIP"},
            # Values
            {"name": "DISK_MULT", "value": str(self.disk_mult)},
            {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
            {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
            {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
        ]

        # Special cases for daemons
        if not is_daemon:
            cpu_value, mem_value = str(self.cpu_mult), str(self.mem_mult)
        elif resource_updated:
            cpu_value, mem_value = str(self.daemon_cpu_mult), str(self.daemon_mem_mult)
        else:
            cpu_value, mem_value = "1.0", "1.0"
        default_envs += [
            {"name": "CPU_MULT", "value": cpu_value},
            {"name": "MEM_MULT", "value": mem_value},
        ]

        rst = []
        for d in default_envs:
            var = V1EnvVar()
            var.name = d["name"]
            if d.get("path", None):
                # Downward API entry: the value comes from a pod field
                field = V1ObjectFieldSelector()
                field.field_path = d["path"]
                src = V1EnvVarSource()
                src.field_ref = field
                var.value_from = src
            else:
                var.value = d["value"]
            rst.append(var)
        return rst
class Application(object):
    """
    Create an Application which maps to a kubernetes namespace
    """

    def __init__(self, name, client=None):
        """
        Prepare the registry secret and application monitor specs.

        Args:
            name: application name; also used as the namespace name
            client: optional kubernetes API client. When None, a proxied
                    KubernetesApiClient is created.

        Raises:
            AXPlatformException: if no axops load balancer address is available
            AssertionError: if a required spec or AX_CLUSTER_NAME_ID is missing
        """
        self.name = name
        self._client = KubernetesApiClient(use_proxy=True) if client is None else client

        self._registry_spec = None
        self._software_info = SoftwareInfo()
        if self._software_info.registry_is_private():
            secret = KubeObjectConfigFile(
                DEFAULT_SECRET_YAML_PATH,
                {"REGISTRY_SECRETS": self._software_info.registry_secrets})
            for obj in secret.get_swagger_objects():
                if isinstance(obj, swagger_client.V1Secret):
                    self._registry_spec = obj
            assert self._registry_spec, "Argo registry specification is missing"

        self._am_service_spec = None
        self._am_deployment_spec = None

        # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
        elb = InternalRoute("axops", "axsys", client=self._client)
        # Indexing [0] directly raised IndexError on an empty list, which made
        # the check below unreachable in exactly the case it is meant to report.
        load_balancers = elb.status(with_loadbalancer_info=True)["loadbalancer"]
        elb_status = load_balancers[0] if load_balancers else None
        if not elb_status:
            raise AXPlatformException(
                "Could not get axops elb address {}".format(elb_status))

        replacements = {
            "NAMESPACE": self._software_info.image_namespace,
            "VERSION": self._software_info.image_version,
            "REGISTRY": self._software_info.registry,
            "APPLICATION_NAME": self.name,
            "AXOPS_EXT_DNS": elb_status
        }
        cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
        assert cluster_name_id, "Cluster name id is None!"
        cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
        if cluster_config.get_cluster_provider() != ClusterProvider.USER:
            axam_path = DEFAULT_AM_YAML_PATH
        else:
            # User clusters use a different template that also needs the
            # data bucket name.
            axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
            replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv(
                "ARGO_DATA_BUCKET_NAME")

        logger.info("Using replacements: %s", replacements)

        k = KubeObjectConfigFile(axam_path, replacements)
        for obj in k.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Service):
                self._am_service_spec = obj
            elif isinstance(obj, swagger_client.V1beta1Deployment):
                self._am_deployment_spec = obj
                # Tag the monitor pods with their deployment name and cost id
                self._add_pod_metadata("deployment",
                                       self._am_deployment_spec.metadata.name,
                                       is_label=True)
                self._add_pod_metadata(
                    "ax_costid",
                    json.dumps({
                        "app": self.name,
                        "service": "axam-deployment",
                        "user": "******"
                    }))
            else:
                logger.debug("Ignoring specification of type {}".format(
                    type(obj)))

        assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"

    def _add_pod_metadata(self, key, value, is_label=False):
        """
        Helper function to add metadata to deployment pod spec for AXAM

        Args:
            key: label or annotation key
            value: label or annotation value
            is_label: store as a label when True, as an annotation otherwise
        """
        pod_meta = self._am_deployment_spec.spec.template.metadata
        if is_label:
            if pod_meta.labels is None:
                pod_meta.labels = {}
            pod_meta.labels[key] = value
        else:
            if pod_meta.annotations is None:
                pod_meta.annotations = {}
            pod_meta.annotations[key] = value

    def create(self, force_recreate=False):
        """
        Create a kubernetes namespace and populate it with argo registry

        Idempotency: This function will be idempotent as long as the content of the secret
        is not changed. If create is called with a registry secret that has been updated and
        the namespace with the secret already exists then it will not update the secret for now.

        Any exception raised along the way is logged and not propagated.

        Args:
            force_recreate: when the monitor deployment already exists,
                            replace it with a salted pod annotation so that
                            its pods are recreated
        """
        @retry_not_exists
        def create_ns_in_provider():
            namespace = swagger_client.V1Namespace()
            namespace.metadata = swagger_client.V1ObjectMeta()
            namespace.metadata.name = self.name
            self._client.api.create_namespace(namespace)

        # NOTE: 403 is not retried as application is getting deleted in parallel
        # 422 is unprocessable object (aka error in spec)
        @retry_unless(status_code=[403, 422])
        def create_reg_in_provider():
            if self._registry_spec is None:
                return
            try:
                self._client.api.create_namespaced_secret(
                    self._registry_spec, self.name)
            except swagger_client.rest.ApiException as e:
                if e.status != 409:
                    raise
                # Already exists: patch it instead
                self._client.api.patch_namespaced_secret(
                    self._registry_spec.to_dict(), self.name,
                    self._registry_spec.metadata.name)

        @retry_unless(status_code=[403, 422])
        def create_app_monitor_service_in_provider():
            try:
                self._client.api.create_namespaced_service(
                    self._am_service_spec, self.name)
            except swagger_client.rest.ApiException as e:
                if e.status != 409:
                    raise
                # Already exists: patch it instead
                self._client.api.patch_namespaced_service(
                    self._am_service_spec.to_dict(), self.name,
                    self._am_service_spec.metadata.name)

        @retry_unless(status_code=[403, 422])
        def create_app_monitor_deployment_in_provider():
            try:
                self._client.apisappsv1beta1_api.create_namespaced_deployment(
                    self._am_deployment_spec, self.name)
            except swagger_client.rest.ApiException as e:
                if e.status != 409:
                    raise
                # Already exists: leave it alone unless a recreate is forced
                if force_recreate:
                    # add a new metadata in pod spec to force the recreation of pods
                    self._add_pod_metadata(
                        "applatix.io/force-recreate-salt",
                        str(uuid.uuid4()))
                    self._client.apisappsv1beta1_api.replace_namespaced_deployment(
                        self._am_deployment_spec, self.name,
                        self._am_deployment_spec.metadata.name)

        try:
            logger.debug("Creating application {}".format(self.name))
            create_ns_in_provider()
            logger.debug("Created namespace {}".format(self.name))
            create_reg_in_provider()
            create_app_monitor_service_in_provider()
            logger.debug("Created application monitor service {}".format(
                self._am_service_spec.metadata.name))
            create_app_monitor_deployment_in_provider()
            logger.debug("Created application monitor deployment {}".format(
                self._am_deployment_spec.metadata.name))
        except Exception as e:
            logger.exception(e)

    def delete(self, timeout=None):
        """
        Delete a kubernetes namespace and image secret for Argo
        Idempotency: Can be repeatedly called

        Args:
            timeout: seconds to wait for the namespace to disappear;
                     None waits without limit

        Raises:
            AXTimeoutException: if the namespace still exists after timeout
        """
        delete_grace_period = 1
        options = swagger_client.V1DeleteOptions()
        options.grace_period_seconds = delete_grace_period
        options.orphan_dependents = False

        @retry_unless(swallow_code=[404, 409])
        def delete_ns_in_provider():
            """
            The retry is not done for 404 (not found) and also for 409 (conflict)
            The 404 case is for simple retry. 409 happens when application delete was
            requested but not complete and another request came in.
            """
            logger.debug("Deleting application {}".format(self.name))
            self._client.api.delete_namespace(options, self.name)

        delete_ns_in_provider()

        # Poll until the namespace is gone or the timeout is exceeded
        start_time = time.time()
        while self.exists():
            logger.debug("Application {} still exists".format(self.name))
            time.sleep(delete_grace_period + 1)
            wait_time = int(time.time() - start_time)
            if timeout is not None and wait_time > timeout:
                raise AXTimeoutException(
                    "Could not delete namespace {} in {} seconds".format(
                        self.name, timeout))

    def exists(self):
        """
        Returns:
            True if the namespace can be read, False if the read gives a 404
        """
        @retry_unless_not_found
        def get_ns_in_provider():
            try:
                self._client.api.read_namespace(self.name)
                return True
            except swagger_client.rest.ApiException as e:
                if e.status == 404:
                    return False
                raise

        return get_ns_in_provider()

    def status(self):
        """
        This function checks the following:
        1. Namespace exists?
        2. Argo Registry exists?
        3. Application Monitor service exists and all replicas of its
           deployment are available?

        The checks run in this order and stop at the first failure.

        Returns:
            A json dict with the status of each
            {
                'namespace': True/False,
                'registry': True/False,
                'monitor': True/False
            }
        """
        ret = {'namespace': False, 'registry': False, 'monitor': False}
        if not self.exists():
            return ret
        ret['namespace'] = True

        registry = self._get_registry_from_provider()
        if registry is None:
            return ret
        ret['registry'] = True

        srv = self._get_am_service_from_provider()
        if srv is None:
            return ret

        am_dep = self._get_am_deployment_from_provider()
        if am_dep is not None and am_dep.status.available_replicas == am_dep.status.replicas:
            ret["monitor"] = True

        return ret

    def healthy(self):
        """
        If all components are present/healthy then return True else return False
        """
        return all(self.status().values())

    def events(self, name=None):
        """
        Args:
            name: when given, only events whose involved object has this name

        Returns:
            The items of the event list for this application's namespace
        """
        return self._get_events_from_provider(name).items

    @retry_unless(swallow_code=[404])
    def _get_registry_from_provider(self):
        # Without a registry spec there is nothing to look up; return a
        # non-None marker so status() treats the registry as present.
        if self._registry_spec is not None:
            return self._client.api.read_namespaced_secret(
                self.name, self._registry_spec.metadata.name)
        else:
            return "NotNeeded"

    @retry_unless(swallow_code=[404])
    def _get_am_service_from_provider(self):
        return self._client.api.read_namespaced_service(
            self.name, self._am_service_spec.metadata.name)

    @retry_unless(swallow_code=[404])
    def _get_am_deployment_from_provider(self):
        return self._client.apisappsv1beta1_api.read_namespaced_deployment(
            self.name, self._am_deployment_spec.metadata.name)

    @retry_unless(swallow_code=[404])
    def _get_events_from_provider(self, name):
        # XXX: For some reason list_namespaced_event does not take a namespace but the _21 version
        # of the function does. Hopefully this gets fixed in swagger soon
        field_selector = None
        if name is not None:
            field_selector = "involvedObject.name={}".format(name)
        return self._client.api.list_namespaced_event(
            self.name, field_selector=field_selector)
class ClusterResumer(ClusterOperationBase):
    """
    Cluster operation that brings a cluster back up: restarts the master,
    restores the auto scaling groups, waits for the master and all minions
    to be ready and finally starts the Argo platform services.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg: a ClusterRestartConfig carrying cluster name/id, cloud profile
                and the dry_run flag
        """
        assert isinstance(cfg, ClusterRestartConfig)
        self._cfg = cfg
        super(ClusterResumer, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile,
            dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=self._cfg.cloud_profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())

        # Initialize node count to 1 as master is not in an auto scaling group
        self._total_nodes = 1
        self._cidr = str(get_public_ip()) + "/32"
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects and is rejected by newer PyYAML releases.
        # Consider yaml.safe_load once it is confirmed that the stored
        # document contains no python-specific tags.
        self._software_info = SoftwareInfo(info_dict=yaml.load(
            self._cluster_info.download_cluster_software_info()))

    def pre_run(self):
        """
        Validate that the cluster can be resumed and mark the resume as started.

        Exits the process with status 0 if the cluster is already running.

        Raises:
            RuntimeError: if the cluster is managed by portal or its stage2
                install information is missing
        """
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )

        if self._csm.is_running():
            logger.info("Cluster is already running.")
            sys.exit(0)

        if not check_cluster_staging(cluster_info_obj=self._cluster_info,
                                     stage="stage2"):
            raise RuntimeError(
                "Cluster is not successfully installed: Stage2 information missing! Operation aborted."
            )

        self._csm.do_resume()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        """
        Mark the resume as done and persist the cluster state if needed.
        """
        self._csm.done_resume()
        self._persist_cluster_state_if_needed()

    def run(self):
        """
        Main resume routine. In dry run mode only logs what would be resumed.

        Raises:
            RuntimeError: wrapping any exception raised by a resume step
        """
        if self._cfg.dry_run:
            logger.info("DRY RUN: Resuming cluster %s with software info %s",
                        self._name_id, self._software_info.to_dict())
            return

        logger.info("%s\n\nResuming cluster %s%s\n", COLOR_GREEN,
                    self._name_id, COLOR_NORM)

        # Main resume cluster routine
        try:
            self._master_manager.restart_master()
            self._recover_auto_scaling_groups()
            self._wait_for_master()
            self._ensure_restarter_access()
            self._wait_for_minions()
            ensure_manifest_temp_dir()
            self._start_platform()
            logger.info("\n\n%sSuccessfully resumed cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            # Always executed, so temporary access is revoked on failure too
            self._disallow_restarter_access_if_needed()

    def _start_platform(self):
        """
        This step brings up Argo platform services
        :return:
        """
        logger.info("Bringing up Argo platform ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)
        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
                              config_file=TEMP_PLATFORM_CONFIG_PATH,
                              software_info=self._software_info)
        platform.start()
        platform.stop_monitor()

    def _wait_for_master(self):
        """
        This step waits for master to be up and running.

        Polls up to WAIT_FOR_RUNNING_MASTER_RETRY times, sleeping 5 seconds
        after every unsuccessful attempt.

        Raises:
            RuntimeError: if no running master was discovered in time
        :return:
        """
        for count in range(WAIT_FOR_RUNNING_MASTER_RETRY):
            logger.info(
                "Waiting for master to be up and running. Trail %s / %s",
                count, WAIT_FOR_RUNNING_MASTER_RETRY)
            running_master = self._master_manager.discover_master(
                state=[EC2InstanceState.Running])
            if running_master:
                logger.info("%sMaster %s is running%s", COLOR_GREEN,
                            running_master, COLOR_NORM)
                return
            time.sleep(5)

        # No master was ever discovered on this path, so there is no
        # instance to name in the error message.
        raise RuntimeError(
            "Timeout waiting for master to come up. Please manually check cluster status"
        )

    def _wait_for_minions(self):
        """
        This step waits for all minions to come up and registered in Kubernetes master

        Raises:
            RuntimeError: if more than WAIT_FOR_MINION_REG_RETRY counted
                attempts pass without all nodes being ready
        :return:
        """
        # Get kubernetes access token
        self._cluster_info.download_kube_config()
        kube_config = self._cluster_info.get_kube_config_file_path()

        # Wait for nodes to be ready.
        # Because we made sure during pause that kubernetes master already knows that all minions are gone,
        # we don't need to worry about cached minions here
        logger.info("Wait 120 seconds before Kubernetes master comes up ...")
        time.sleep(120)
        kubectl = KubernetesApiClient(config_file=kube_config)
        logger.info("Waiting for all Kubelets to be ready ...")

        trail = 0
        while True:
            try:
                all_kubelets_ready = True
                nodes = kubectl.api.list_node()
                logger.info("%s / %s nodes registered", len(nodes.items),
                            self._total_nodes)
                if len(nodes.items) < self._total_nodes:
                    all_kubelets_ready = False
                else:
                    for n in nodes.items:
                        # Condition reasons every node must report; each one
                        # seen on the node is checked off below.
                        kubelet_check = {
                            "KubeletHasSufficientDisk",
                            "KubeletHasSufficientMemory",
                            "KubeletHasNoDiskPressure",
                            "KubeletReady",
                            "RouteCreated"
                        }
                        # A node without any reported conditions yet is
                        # simply treated as not ready.
                        for cond in n.status.conditions or []:
                            kubelet_check.discard(cond.reason)
                        if kubelet_check:
                            logger.info(
                                "Node %s not ready yet. Remaining Kubelet checkmarks: %s",
                                n.metadata.name, kubelet_check)
                            all_kubelets_ready = False
                            break
                        logger.info("Node %s is ready.", n.metadata.name)

                if all_kubelets_ready:
                    logger.info("All Kubelets are ready")
                    break
            except Exception as e:
                if "Max retries exceeded" in str(e):
                    # If master API server is still not ready at this moment, we don't count as a trail
                    trail -= 1
                    logger.info("Kubernetes API server not ready yet")
                else:
                    logger.exception(
                        "Caught exception when listing nodes: %s", e)

            trail += 1
            if trail > WAIT_FOR_MINION_REG_RETRY:
                raise RuntimeError(
                    "Timeout waiting for minions to come up. Please manually check cluster status"
                )
            time.sleep(10)

    def _recover_auto_scaling_groups(self):
        """
        This step does the following:
        - fetch the previously restored auto scaling group config. If this config cannot be found,
          we can assume that all autoscaling groups have correct configurations. This could happen
          when previous restart failed in the middle but passed this stage already, or the cluster is
          not even paused
        - Wait for all instances to be in service

        As a side effect the desired capacity of every auto scaling group is
        added to the expected total node count.
        :return:
        """
        # Get previously persisted asg status
        logger.info("Fetching last cluster status ...")
        cluster_status_raw = self._cluster_info.download_cluster_status_before_pause()

        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())

        if not cluster_status_raw:
            for asg in asg_mgr.get_all_asgs():
                self._total_nodes += asg["DesiredCapacity"]
            logger.info(
                "Cannot find last cluster status, cluster already resumed with %s nodes",
                self._total_nodes)
            return

        logger.info("Found last cluster status, restoring cluster ...")
        # NOTE(review): yaml.load without an explicit Loader; see the
        # note in __init__ before switching to yaml.safe_load.
        cluster_status = yaml.load(cluster_status_raw)
        all_asg_statuses = cluster_status["asg_status"]

        # Restore minions
        for asg_name, asg_status in all_asg_statuses.items():
            min_size = asg_status["min_size"]
            max_size = asg_status["max_size"]
            desired = asg_status["desired_capacity"]
            self._total_nodes += desired
            logger.info(
                "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                asg_name, min_size, max_size, desired)
            asg_mgr.set_asg_spec(name=asg_name,
                                 minsize=min_size,
                                 maxsize=max_size,
                                 desired=desired)

        logger.info("Waiting for all auto scaling groups to scale up ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster instances are in service%s", COLOR_GREEN,
                    COLOR_NORM)

        # Delete previously stored cluster status
        self._cluster_info.delete_cluster_status_before_pause()

    def _ensure_restarter_access(self):
        """
        Temporarily allow this machine's public IP to access the cluster nodes
        when it is not already part of the trusted CIDRs.
        """
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Restarting cluster from a not trusted IP (%s). Temporarily allowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager")

    def _disallow_restarter_access_if_needed(self):
        """
        Revoke the access for this machine's public IP when it is not part of
        the trusted CIDRs.
        """
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Restarting cluster from a not trusted IP (%s). Disallowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[self._cidr],
                new_cidr=[],
                action_name="disallow-cluster-manager")
class Container(KubeObject):
    """
    Class for creating container specifications
    """
    LIVENESS_PROBE = 1
    READINESS_PROBE = 2

    def __init__(self, name, image, pull_policy=None):
        """
        Construct a container that will provide the spec for a kubernetes container
        http://kubernetes.io/docs/api-reference/v1/definitions/#_v1_container

        Args:
            name: name of a container. must be conformant to kubernetes container name
            image: image for container
            pull_policy: pull policy based on kubernetes. If None then kubernetes default is used
        """
        self.name = name
        self.image = image
        self.image_pull_policy = pull_policy
        self.command = None
        self.args = None
        # volume name -> volume object
        self.vmap = {}
        # env variable name -> V1EnvVar
        self.env_map = {}
        self.ports = []
        # resource name -> (request, limit); None until a constraint is added
        self.resources = None
        self.privileged = None
        self.software_info = SoftwareInfo()
        # probe type (LIVENESS_PROBE / READINESS_PROBE) -> probe spec
        self.probes = {}

    def generate_spec(self):
        """
        Build the swagger V1Container from the state collected on this object.

        Returns:
            a swagger_client.V1Container

        Raises:
            AXIllegalArgumentException: if a probe of unknown type was added or
                a probe spec cannot be processed
        """
        c = swagger_client.V1Container()
        c.name = self.name
        c.image = self.image

        if self.resources is not None:
            c.resources = swagger_client.V1ResourceRequirements()
            c.resources.requests = {}
            c.resources.limits = {}

            if "cpu_cores" in self.resources:
                cpu_request, cpu_limit = self.resources["cpu_cores"]
                c.resources.requests["cpu"] = str(cpu_request)
                if cpu_limit is not None:
                    c.resources.limits["cpu"] = str(cpu_limit)

            if "mem_mib" in self.resources:
                mem_request, mem_limit = self.resources["mem_mib"]
                c.resources.requests["memory"] = "{}Mi".format(mem_request)
                if mem_limit is not None:
                    c.resources.limits["memory"] = "{}Mi".format(mem_limit)

        # Kubernetes 1.5 requires init container must specify image pull policy. Since we are setting
        # a pull policy for all containers, we want to replicate the kubernetes default behavior of pulling
        # the image if tag is "latest"
        if self.image.endswith(':latest'):
            c.image_pull_policy = ContainerImagePullPolicy.PullAlways
        else:
            c.image_pull_policy = self.image_pull_policy or ContainerImagePullPolicy.PullIfNotPresent

        if self.command:
            c.command = self.command
        if self.args:
            c.args = self.args

        c.volume_mounts = [
            vol.get_container_spec() for vol in self.vmap.values()
        ]
        c.env = list(self.env_map.values())

        if self.privileged is not None:
            c.security_context = swagger_client.V1SecurityContext()
            c.security_context.privileged = self.privileged

        for probe, probe_spec in self.probes.items():
            probe_k8s_spec = Container._generate_probe_spec(probe_spec)
            if probe == Container.LIVENESS_PROBE:
                c.liveness_probe = probe_k8s_spec
            elif probe == Container.READINESS_PROBE:
                c.readiness_probe = probe_k8s_spec
            else:
                raise AXIllegalArgumentException(
                    "Unexpected probe type {} found with spec {}".format(
                        probe, probe_spec))

        return c

    def add_resource_constraints(self, resource, request, limit=None):
        """
        Record a (request, limit) pair for the given resource.

        Args:
            resource: resource key; generate_spec reads "cpu_cores" and "mem_mib"
            request: requested amount
            limit: optional upper limit, None for no limit
        """
        if self.resources is None:
            self.resources = {}
        self.resources[resource] = (request, limit)

    def add_volume(self, volume):
        """
        Register a volume under its name, replacing one with the same name.
        """
        self.vmap[volume.name] = volume

    def add_volumes(self, volumes):
        """
        Register every volume of an iterable. None is treated as empty.
        """
        for vol in volumes or []:
            self.add_volume(vol)

    def get_volume(self, name):
        """
        Return the volume registered under name, or None.
        """
        return self.vmap.get(name, None)

    def add_env(self, name, value=None, value_from=None):
        """
        Add an environment variable, replacing one with the same name.

        Args:
            name: name of the environment variable
            value: literal value; takes precedence when not None
            value_from: field path used for a field reference when value is None
        """
        env = swagger_client.V1EnvVar()
        env.name = name

        if value is not None:
            env.value = value
        else:
            assert value_from is not None, "value and value_from both cannot be None for env {}".format(
                name)
            env.value_from = swagger_client.V1EnvVarSource()
            env.value_from.field_ref = swagger_client.V1ObjectFieldSelector()
            env.value_from.field_ref.field_path = value_from
            # Some 1.5 requires this. https://github.com/kubernetes/kubernetes/issues/39189
            env.value_from.field_ref.api_version = "v1"

        self.env_map[name] = env

    def add_probe(self, probe_type, probe_spec):
        """
        Register a probe spec for the given probe type, replacing an existing one.
        """
        self.probes[probe_type] = probe_spec

    def parse_probe_spec(self, container_template):
        """
        Register the liveness and readiness probes present on the template.

        @type container_template: argo.template.v1.container.ContainerTemplate
        """
        if container_template.liveness_probe:
            self.add_probe(Container.LIVENESS_PROBE,
                           container_template.liveness_probe)

        if container_template.readiness_probe:
            self.add_probe(Container.READINESS_PROBE,
                           container_template.readiness_probe)

    def get_registry(self, namespace="axuser"):
        """
        This function returns the name of the secrets file that needs to be
        used in the pod specification image_pull_secrets array

        Args:
            namespace: namespace in which the image pull secret is looked up

        Returns:
            name of the image pull secret, or None when no secret is needed
            or none could be found
        """
        (reg, _, _) = DockerImage(fullname=self.image).docker_names()

        if reg == self.software_info.registry:
            if self.software_info.registry_is_private():
                return "applatix-registry"
            return None

        try:
            smanager = SecretsManager()
            secret = smanager.get_imgpull(reg, namespace)
            if secret:
                return secret.metadata.name

            # Code for copying the registry to the app namespace if
            # it does not exist. We do not copy to axuser as secrets
            # are always created there. For axuser the lookup above
            # already covered that namespace, so it is not repeated.
            if namespace != "axuser":
                secret_axuser = smanager.get_imgpull(reg, "axuser")
                if secret_axuser:
                    smanager.copy_imgpull(secret_axuser, namespace)
                    return secret_axuser.metadata.name
        except Exception as e:
            # Best effort: a failed lookup is reported as "no secret"
            logger.debug(
                "Did not find a secret for registry {} due to exception {}"
                .format(reg, e))

        return None

    def volume_iterator(self):
        """
        Generator over all volumes registered on this container.
        """
        for vol in self.vmap.values():
            yield vol

    @staticmethod
    def _generate_probe_spec(spec):
        """
        Convert a probe spec into a swagger V1Probe.

        Returns:
            the V1Probe for an exec or http_get probe, None for any other kind

        Raises:
            AXIllegalArgumentException: if processing the spec raises

        @type spec argo.template.v1.container.ContainerProbe
        """
        try:
            probe = swagger_client.V1Probe()
            probe.initial_delay_seconds = spec.initial_delay_seconds
            probe.timeout_seconds = spec.timeout_seconds
            probe.period_seconds = spec.period_seconds
            probe.failure_threshold = spec.failure_threshold
            probe.success_threshold = spec.success_threshold

            if spec.exec_probe:
                action = swagger_client.V1ExecAction()
                action.command = shlex.split(spec.exec_probe.command)
                # NOTE(review): written to the underscore attribute,
                # presumably because "exec" is a reserved word in
                # Python 2 - confirm against the swagger model.
                probe._exec = action
                return probe

            if spec.http_get:
                action = swagger_client.V1HTTPGetAction()
                action.path = spec.http_get.path
                action.port = spec.http_get.port
                action.http_headers = []
                for header in spec.http_get.http_headers or []:
                    h = swagger_client.V1HTTPHeader()
                    h.name = header["name"]
                    h.value = header["value"]
                    action.http_headers.append(h)
                probe.http_get = action
                return probe

            logger.debug("Cannot handle probe {}".format(spec))
        except Exception as e:
            raise AXIllegalArgumentException(
                "Probe {} cannot be processed due to error {}".format(spec, e))

        return None