def __init__(self, name, client=None):
    self.name = name
    if client is None:
        self._client = KubernetesApiClient(use_proxy=True)
    else:
        self._client = client

    self._registry_spec = None
    self._software_info = SoftwareInfo()
    if self._software_info.registry_is_private():
        secret = KubeObjectConfigFile(DEFAULT_SECRET_YAML_PATH,
                                      {"REGISTRY_SECRETS": self._software_info.registry_secrets})
        for obj in secret.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Secret):
                self._registry_spec = obj
        assert self._registry_spec, "Argo registry specification is missing"

    self._am_service_spec = None
    self._am_deployment_spec = None

    # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
    elb = InternalRoute("axops", "axsys", client=self._client)
    elb_status = elb.status(with_loadbalancer_info=True)["loadbalancer"][0]
    if not elb_status:
        raise AXPlatformException("Could not get axops elb address {}".format(elb_status))

    replacements = {"NAMESPACE": self._software_info.image_namespace,
                    "VERSION": self._software_info.image_version,
                    "REGISTRY": self._software_info.registry,
                    "APPLICATION_NAME": self.name,
                    "AXOPS_EXT_DNS": elb_status}
    cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
    assert cluster_name_id, "Cluster name id is None!"
    cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
    if not cluster_config.get_cluster_provider().is_user_cluster():
        axam_path = DEFAULT_AM_YAML_PATH
    else:
        axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
        replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv("ARGO_DATA_BUCKET_NAME")
    logger.info("Using replacements: %s", replacements)

    k = KubeObjectConfigFile(axam_path, replacements)
    for obj in k.get_swagger_objects():
        if isinstance(obj, swagger_client.V1Service):
            self._am_service_spec = obj
        elif isinstance(obj, swagger_client.V1beta1Deployment):
            self._am_deployment_spec = obj
            self._add_pod_metadata("deployment", self._am_deployment_spec.metadata.name, is_label=True)
            self._add_pod_metadata("ax_costid", json.dumps({
                "app": self.name,
                "service": "axam-deployment",
                "user": "******"
            }))
        else:
            logger.debug("Ignoring specification of type {}".format(type(obj)))
    assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"
def get_spec(self):
    # generate the metadata
    metadata = swagger_client.V1ObjectMeta()
    metadata.name = self.name
    metadata.annotations = {
        "pod.beta.kubernetes.io/init-containers": self._init_containers_spec()
    }
    for a in self.annotations:
        metadata.annotations[a] = self.annotations[a]
    metadata.labels = {}
    for l in self.labels:
        metadata.labels[l] = self.labels[l]

    # generate the pod specification
    pspec = swagger_client.V1PodSpec()
    if self.hostname:
        pspec.hostname = self.hostname

    pspec.containers = []
    if "wait" in self.cmap:
        pspec.containers.append(self.cmap["wait"].generate_spec())
    assert "main" in self.cmap, "Pod specification cannot be generated without a main container"
    pspec.containers.append(self.cmap["main"].generate_spec())
    if "dind" in self.cmap:
        pspec.containers.append(self.cmap["dind"].generate_spec())

    pspec.image_pull_secrets = self._build_image_pull_secrets()
    pspec.volumes = self._volume_spec()
    if self.restart_policy is not None:
        pspec.restart_policy = self.restart_policy

    cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
    assert cluster_name_id, "Cluster name id is None!"
    cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
    if not cluster_config.get_cluster_provider().is_user_cluster():
        pspec.node_selector = {"ax.tier": self._tier}

    # finalize the pod template spec
    spec = swagger_client.V1PodTemplateSpec()
    spec.metadata = metadata
    spec.spec = pspec
    return spec
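# Illustrative sketch (not part of the original module): how the pod template built by
# get_spec() above is typically embedded into a deployment spec, mirroring what
# Deployment._generate_deployment_spec_for_pod does later in this file. `pod_builder`
# is a hypothetical instance of this builder class.
#
#     template = pod_builder.get_spec()
#     dspec = swagger_client.V1beta1DeploymentSpec()
#     dspec.replicas = 1
#     dspec.selector = swagger_client.V1LabelSelector()
#     dspec.selector.match_labels = template.metadata.labels
#     dspec.template = template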
class AXPlatform(object):

    def __new__(cls, *args, **kwargs):
        if Cloud().target_cloud_gcp():
            from .gke_platform import AXGKEPlatform
            return super(AXPlatform, cls).__new__(AXGKEPlatform)
        else:
            return super(AXPlatform, cls).__new__(cls)

    def __init__(self,
                 cluster_name_id=None,
                 aws_profile=None,
                 debug=True,
                 manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
                 config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
                 software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        """
        self._software_info = software_info if software_info else SoftwareInfo()
        assert isinstance(self._software_info, SoftwareInfo), \
            "Wrong type ({}) of software info passed in.".format(self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s", self._config.name, config_file)

        self._cluster_name_id = AXClusterId(cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id,
                                               aws_profile=self._aws_profile)
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id, aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we fail to create an object, we don't delete it but
        # leave it around for debugging.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
self._kube_config = self._cluster_info.get_kube_config_file_path( ) if self._cluster_name_id else None if self._cluster_name_id: if not os.path.isfile(self._kube_config): logger.info( "Can't find config file at %s; downloading from s3", self._kube_config) self._kube_config = self._cluster_info.download_kube_config() assert os.path.isfile( self._kube_config), "No kube_config file available" # Kubernetes related objects and macros self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER] self.kube_axsys_namespace = AXNameSpaces.AXSYS self.kube_user_namespace = AXNameSpaces.AXUSER self.kubectl = KubernetesApiClient(config_file=self._kube_config) self.kube_poll = KubeObjPoll(kubectl=self.kubectl) self._monitor = AXKubeMonitor(kubectl=self.kubectl) self._monitor.reload_monitors(namespace=self.kube_axsys_namespace) self._monitor.start() # Kube Objects self._kube_objects = {} self._replacing = {} def _load_kube_objects_from_steps(self, steps): """ Extract kube objects from steps in config, and load them into memory :param steps: list :return: """ for object_group in steps: assert isinstance(object_group, AXPlatformObjectGroup) for obj in object_group.object_set: assert isinstance(obj, AXPlatformObject) name = obj.name filename = obj.manifest namespace = obj.namespace if name in self._kube_objects: raise ValueError("Duplicated object name {}".format(name)) kubeobj_conf_path = os.path.join(self._manifest_root, filename) self._kube_objects[name] = KubeObject( config_file=kubeobj_conf_path, kubepoll=self.kube_poll, replacing=None, kube_config=self._kube_config, kube_namespace=namespace) def _get_trusted_cidr_str(self): trusted_cidr = self._cluster_config.get_trusted_cidr() if isinstance(trusted_cidr, list): trusted_cidr_str = "[" for cidr in trusted_cidr: trusted_cidr_str += "\"{}\",".format(str(cidr)) trusted_cidr_str = trusted_cidr_str[:-1] trusted_cidr_str += "]" else: trusted_cidr_str = "[{}]".format(trusted_cidr) return trusted_cidr_str def _generate_replacing_for_user_provisioned_cluster(self): trusted_cidr_str = self._get_trusted_cidr_str() self._persist_node_resource_rsvp(0, 0) with open("/kubernetes/cluster/version.txt", "r") as f: cluster_install_version = f.read().strip() return { "REGISTRY": self._software_info.registry, "REGISTRY_SECRETS": self._software_info.registry_secrets, "NAMESPACE": self._software_info.image_namespace, "VERSION": self._software_info.image_version, "AX_CLUSTER_NAME_ID": self._cluster_name_id, "AX_AWS_REGION": self._region, "AX_AWS_ACCOUNT": self._account, "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(), "TRUSTED_CIDR": trusted_cidr_str, "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ", "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ", "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"), "AX_CLUSTER_INSTALL_VERSION": cluster_install_version, "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()), "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(), "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key( key=self._cluster_config_path.cluster_metadata()), "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]), "ARGO_DATA_BUCKET_NAME": AXClusterConfigPath(self._cluster_name_id).bucket(), "LOAD_BALANCER_TYPE": "LoadBalancer", "ARGO_S3_ACCESS_KEY_ID": base64.b64encode(os.getenv("ARGO_S3_ACCESS_KEY_ID", "")), "ARGO_S3_ACCESS_KEY_SECRET": base64.b64encode(os.getenv("ARGO_S3_ACCESS_KEY_SECRET", "")), } def _generate_replacing(self): # Platform code are running in python 
2.7, and therefore for trusted cidr list, the str() method # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and # this 'u' prefix cannot be surpressed. With this prefix, our macro replacing would create invalid # yaml files, and therefore we construct string manually here trusted_cidr_str = self._get_trusted_cidr_str() axsys_cpu = 0 axsys_mem = 0 daemon_cpu = 0 daemon_mem = 0 for name in self._kube_objects.keys(): cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage axsys_cpu += cpu axsys_mem += mem daemon_cpu += dcpu daemon_mem += dmem # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not # have a memory request, but this is an approximation) daemon_cpu += 100 daemon_mem += 100 logger.info( "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi", axsys_cpu, axsys_mem, daemon_cpu, daemon_mem) axsys_node_count = int(self._cluster_config.get_asxys_node_count()) axuser_min_count = str( int(self._cluster_config.get_min_node_count()) - axsys_node_count) axuser_max_count = str( int(self._cluster_config.get_max_node_count()) - axsys_node_count) autoscaler_scan_interval = str( self._cluster_config.get_autoscaler_scan_interval()) usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[ self._cluster_config.get_axuser_node_type()]["cpu"] usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[ self._cluster_config.get_axuser_node_type()]["memory"] scale_down_util_thresh = round( max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001 logger.info("Setting node scale down utilization threshold to %s", scale_down_util_thresh) self._persist_node_resource_rsvp(daemon_cpu, daemon_mem) with open("/kubernetes/cluster/version.txt", "r") as f: cluster_install_version = f.read().strip() # Prepare autoscaler asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile) asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg( ) or asg_manager.get_on_demand_asg() if not asg: raise AXPlatformException( "Failed to get autoscaling group for cluster {}".format( self._cluster_name_id)) asg_name = asg["AutoScalingGroupName"] if not asg_name: logger.error("Autoscaling group name not found for %s", self._cluster_name_id) raise AXPlatformException("Cannot find cluster autoscaling group") # Prepare minion-manager. 
        spot_instances_option = self._cluster_config.get_spot_instances_option()
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            for asg in asg_manager.get_all_asgs():
                minion_manager_asgs = minion_manager_asgs + asg["AutoScalingGroupName"] + " "
            minion_manager_asgs = minion_manager_asgs[:-1]
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            minion_manager_asgs = asg_manager.get_variable_asg()["AutoScalingGroupName"]

        return {
            "REGISTRY": self._software_info.registry,
            "REGISTRY_SECRETS": self._software_info.registry_secrets,
            "NAMESPACE": self._software_info.image_namespace,
            "VERSION": self._software_info.image_version,
            "AX_CLUSTER_NAME_ID": self._cluster_name_id,
            "AX_AWS_REGION": self._region,
            "AX_AWS_ACCOUNT": self._account,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR": trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
            "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
            "ASG_MIN": axuser_min_count,
            "ASG_MAX": axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME": asg_name,
            "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES": str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs,
        }

    def _persist_node_resource_rsvp(self, user_node_daemon_cpu, user_node_daemon_mem):
        self._cluster_config.set_user_node_resource_rsvp(cpu=user_node_daemon_cpu,
                                                         mem=user_node_daemon_mem)
        self._cluster_config.save_config()

    def start(self):
        """
        Bring up platform using "platform-start.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects
        steps = self._config.steps
        self._load_kube_objects_from_steps(steps)

        if self._cluster_config.get_cluster_provider() != ClusterProvider.USER:
            self._replacing = self._generate_replacing()
        else:
            self._replacing = self._generate_replacing_for_user_provisioned_cluster()

        logger.debug("Replacing ENVs: %s", self._replacing)

        # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32)
        # At this moment, we MUST separate the first steps due to the above dependency
        assert len(steps) >= 3, "Should have at least 3 steps to create axops and its dependencies"
        self.create_objects(steps[0])
        self.create_objects(steps[1])
        self.create_objects(steps[2])

        # Prepare axops_eip
        if self._cluster_config.get_provider() != "minikube":
            self._set_ext_dns()

        info_bound = "=======================================================\n"
        img_namespace = "Image Namespace: {}\n".format(self._software_info.image_namespace)
        img_version = "Image Version: {}\n".format(self._software_info.image_version)
        start_info = "\n\n{}{}{}{}{}".format(info_bound,
                                             "Platform Up: Bringing up Argo services...\n",
                                             img_namespace, img_version, info_bound)
        logger.info(start_info)

        # Start rest of the objects
        for i in range(3, len(steps)):
            self.create_objects(steps[i])

        # update application namespace
        logger.info("Updating application managers")
        for app in Applications(client=self.kubectl).list():
            logger.info("--- updating {}".format(app))
            a = Application(app, client=self.kubectl)
a.create(force_recreate=True) logger.info("Done updating application managers") # Upload version information to target cluster self._update_version() logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n", COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name, COLOR_NORM) def stop(self): """ Bring down platform using "platform-stop.cfg" configuration from manifest directory :return: """ # Generate kube-objects (Does not need to generate replacing during platform down) # Stop order should be the reverse of start steps = self._config.steps steps.reverse() self._load_kube_objects_from_steps(steps) info_bound = "=======================================================\n" stop_info = "\n\n{}{}{}".format( info_bound, "Platform Down: Shutting down Argo services...\n", info_bound) logger.info(stop_info) # Bring down objects according to steps for i in range(len(steps)): object_group = steps[i] self.delete_objects(object_group) def stop_monitor(self): self._monitor.stop() def create_objects(self, objects): """ Start kubernetes objects based on records. Wait for all of them. :param objects: AXPlatformObjectGroup """ if objects is None or len(objects.object_set) == 0: return assert isinstance(objects, AXPlatformObjectGroup) if not self._should_create_group( policy=objects.policy, policy_predicate=objects.policy_predicate, consistency=objects.consistency): logger.debug( "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)", objects.name, objects.policy, objects.policy_predicate, objects.consistency) return logger.info("Create step: %s", objects.name) logger.info("Creating platform objects\n\n%s", self._generate_object_summary(objects.object_set)) pool = ThreadPool(len(objects.object_set)) async_results = {} for obj in objects.object_set: assert isinstance(obj, AXPlatformObject) name = obj.name namespace = obj.namespace async_results[name] = pool.apply_async( self.start_one, args=(name, ), kwds={"namespace": namespace}) pool.close() pool.join() report, failed = self._generate_report(async_results, "Create") logger.info(report) if failed: raise AXPlatformException("Failed to create platform objects.") def _should_create_group(self, policy, policy_predicate, consistency): """ Take AXPlatformObjectGroup policy, predicate and consistency and determine if this group should be created or not :param policy: :param policy_predicate: :param consistency: :return: """ # Since we are not using consistency, we should always create if not # explicitly told not to, i.e. if there is a PrivateRegistryOnly # We are just leaving the interface here that should create or not # need to be decided by policy, policy_predicate and consistency if policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly and \ not self._software_info.registry_is_private(): return False return True def delete_objects(self, objects): """ Stop kubernetes objects based on records. Wait for all of them. 
:param objects: AXPlatformObjectGroup """ assert isinstance(objects, AXPlatformObjectGroup) if not self._should_delete_group( policy=objects.policy, policy_predicate=objects.policy_predicate): logger.debug( "Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)", objects.name, objects.policy, objects.policy_predicate) return logger.info("Delete step: %s", objects.name) logger.info("Deleting platform objects\n\n%s.", self._generate_object_summary(objects.object_set)) pool = ThreadPool(len(objects.object_set)) async_results = {} for obj in objects.object_set: assert isinstance(obj, AXPlatformObject) name = obj.name namespace = obj.namespace async_results[name] = pool.apply_async( self.stop_one, args=(name, ), kwds={"namespace": namespace}) pool.close() pool.join() report, failed = self._generate_report(async_results, "Delete") logger.info(report) if failed: raise AXPlatformException("Failed to create platform objects.") def _should_delete_group(self, policy, policy_predicate): """ Take AXPlatformObjectGroup policy and determine if this group should be deleted or not. Consistency is not needed for deletion :param policy: :param policy_predicate: :return: """ if policy == ObjectGroupPolicy.CreateMany: return True return False def start_one(self, name, namespace=AXNameSpaces.AXSYS): time.sleep( random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter)) logger.info("Creating %s in namespace %s ...", name, namespace) start = time.time() kube_obj = self._kube_objects[name] # Update them as there are new updates in replacing in platform start kube_obj.namespace = namespace kube_obj.replacing = self._replacing assert isinstance(kube_obj, KubeObject) result = { "name": name, "code": [], "events": [], "failed": False, "duration": "" } if kube_obj.healthy(): result["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS) ] result["duration"] = str(round(time.time() - start, 2)) return result # Previous platform start might fail, and might result in some componenets created # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing # object and try to create a new one if kube_obj.exists(): logger.warning( "Object %s exists but not healthy. Deleting object for idempotency ...", name) self.stop_one(name, namespace) assert not kube_obj.exists( ), "Kubeobject {} already created but is not healthy. Not Expected".format( name) monitor_info = kube_obj.get_create_monitor_info() if monitor_info: # use monitor waiters = [] # Create and register waiters for all objects that can be monitored for m in monitor_info: wait_info = { "kind": KubeKindToKubeApiObjKind[m.kube_kind], "name": m.name, "validator": m.validator } waiter = KubeObjWaiter() waiters.append((waiter, wait_info)) AXKubeMonitor().wait_for_kube_object( wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout, waiter) # Call kubectl create kube_obj.create() # Wait on all waiters to retrieve status and events for waiter, wait_info in waiters: waiter.wait() result["events"] += waiter.details result["code"].append("{:.25s}:{}".format( wait_info["name"], waiter.result)) if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN: logger.info("Successfully created %s with code %s.", wait_info["name"], waiter.result) else: result["failed"] = True logger.error( "Failed to create %s in %s with code %s. 
Events: %s", wait_info["name"], namespace, waiter.result, str(waiter.details)) if not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) result["code"] += del_rst["code"] result["events"] += del_rst["events"] result["duration"] = str(round(time.time() - start, 2)) return result # Poll extra if required (for Petset and Deployments with multiple replicas) if kube_obj.extra_poll: logger.info( "Polling till healthy to make sure rest of components of %s are up and running ...", name) create_rst = self._poll_till_healthy( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults. ObjCreateExtraPollInterval, poll_max_retry=AXPlatformConfigDefaults. ObjCreateExtraPollMaxRetry, rst=result) if create_rst["failed"] and not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) create_rst["code"] += del_rst["code"] create_rst["events"] += del_rst["events"] create_rst["duration"] = str(round(time.time() - start, 2)) return create_rst # Poll once to confirm all components from this Kubernetes config file exist, # In case there are objects in this config file cannot be monitored, i.e. svc # without elb. This is really not expected so we don't delete it if not kube_obj.healthy(): logger.error( "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.", name) result["code"].append("{:.25s}:{}".format( name, KubeObjStatusCode.UNHEALTHY)) result["failed"] = True result["events"].append( "Object {} created byt is not healthy".format(name)) result["duration"] = str(round(time.time() - start, 2)) if not result["failed"]: logger.info("Successfully created object %s.", name) return result else: # use polling kube_obj.create() create_rst = self._poll_till_healthy( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval, poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry, rst=result) if create_rst["failed"] and not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) create_rst["code"] += del_rst["code"] create_rst["events"] += del_rst["events"] create_rst["duration"] = str(round(time.time() - start, 2)) return create_rst @staticmethod def _poll_till_healthy(name, kube_obj, start_time, poll_interval, poll_max_retry, rst): trail = 0 assert isinstance(kube_obj, KubeObject) while True: if not kube_obj.healthy(): trail += 1 if trail > poll_max_retry: logger.error("Failed to create KubeObject %s", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY) ] rst["events"] += [ "Object {} creation timeout. 
Not healthy".format(name) ] rst["failed"] = True rst["duration"] = str(round(time.time() - start_time, 2)) return rst else: logger.info("Successfully created %s.", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.OK) ] rst["failed"] = False rst["duration"] = str(round(time.time() - start_time, 2)) return rst time.sleep(poll_interval) def stop_one(self, name, namespace=AXNameSpaces.AXSYS): time.sleep( random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter)) logger.info("Deleting %s in namespace %s ...", name, namespace) start = time.time() kube_obj = self._kube_objects[name] kube_obj.namespace = namespace kube_obj.replacing = self._replacing assert isinstance(kube_obj, KubeObject) result = { "name": name, "code": [], "events": [], "failed": False, "duration": "" } # Don't delete if object does not exist if not kube_obj.exists(): result["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED) ] result["duration"] = str(round(time.time() - start, 2)) return result monitor_info = kube_obj.get_delete_monitor_info() if monitor_info: # use monitor waiters = [] # Create and register waiters for all objects that can be monitored for m in monitor_info: wait_info = { "kind": KubeKindToKubeApiObjKind[m.kube_kind], "name": m.name, "validator": m.validator } waiter = KubeObjWaiter() waiters.append((waiter, wait_info)) AXKubeMonitor().wait_for_kube_object( wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout, waiter) # Call kubectl delete kube_obj.delete() # Wait on all waiters to retrieve status and events for waiter, wait_info in waiters: waiter.wait() result["events"] += waiter.details if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN: result["code"].append("{:.25s}:{}".format( wait_info["name"], KubeObjStatusCode.DELETED)) logger.info("Successfully deleted %s in %s with code %s.", wait_info["name"], name, result["code"]) else: result["failed"] = True result["code"].append("{:.25s}:{}".format( wait_info["name"], KubeObjStatusCode.UNKNOWN)) logger.error( "Failed to delete %s in %s with code %s. Events: %s", wait_info["name"], name, result["code"], str(waiter.details)) # Poll once to confirm all components from this Kubenetes config file exist # In case there are objects in this config file cannot be monitored, i.e. svc without elb if kube_obj.exists(): logger.error("Object %s deleted but still exists", name) result["failed"] = True result["code"].append("{:.25s}:{}".format( name, KubeObjStatusCode.UNKNOWN)) result["events"].append( "Object {} deleted but still exists.".format(name)) result["duration"] = str(round(time.time() - start, 2)) logger.info("Successfully deleted %s.", name) return result else: # use polling kube_obj.delete() return self._poll_till_not_exists( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval, poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry, rst=result) @staticmethod def _poll_till_not_exists(name, kube_obj, start_time, poll_interval, poll_max_retry, rst): trail = 0 assert isinstance(kube_obj, KubeObject) while True: if kube_obj.exists(): trail += 1 if trail > poll_max_retry: logger.error("Failed to delete KubeObject %s", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.UNKNOWN) ] rst["events"] += [ "Object {} deletion timeout. 
Please manually check remaining pods" .format(name) ] rst["failed"] = True rst["duration"] = str(round(time.time() - start_time, 2)) return rst else: logger.info("Successfully deleted %s.", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED) ] rst["failed"] = False rst["duration"] = str(round(time.time() - start_time, 2)) return rst time.sleep(poll_interval) def _generate_object_summary(self, objects): """ :param objects: list of AXPlatformObject :return: """ report_title = "\n{:25s} | {:110s} | {:20s}\n".format( "NAME", "MANIFEST", "NAMESPACE") report_bar = "{}\n".format("-" * 174) content = "" for obj in objects: assert isinstance(obj, AXPlatformObject) name = obj.name filename = os.path.join(self._manifest_root, obj.manifest) namespace = obj.namespace content += "{:25s} | {:110s} | {:20s}\n".format( name, filename, namespace) return report_title + report_bar + content @staticmethod def _generate_report(results, operation): failed = False report_body = "" warnings = "\n======= WARNING EVENTS =======\n" for name in results.keys(): individual_report = "{:25s} | {:110s} | {:20s}\n" individual_warning = "{name}: {events}\n\n" try: result = results[name].get() if result["failed"]: failed = True code = result["code"][0] for c in result["code"][1:]: code += " / {}".format(c) individual_report = individual_report.format( name, code, result["duration"], 2) if len(result["events"]) > 0: warnings += individual_warning.format( name=name, events=str(result["events"])) except Exception as e: failed = True logger.exception(str(e)) individual_report = individual_report.format( name, "EXCEPTION", "UNKNOWN") warnings += individual_warning.format(name=name, events=str(e)) report_body += individual_report report_head = "\n\nPlatform {} {}. Report:\n".format( operation, "FAILED" if failed else "SUCCESSFULLY") report_title = "\n{:25s} | {:110s} | {:20s}\n".format( "NAME", "STATUS", "TIME (sec)") report_bar = "{}\n".format("-" * 174) return "{}{}{}{}{}{}".format( report_head, report_title, report_bar, report_body, warnings, "==============================\n"), failed def _get_eip_from_config_map(self): try: cmd = [ "kubectl", "get", "configmap", "cluster-dns-name", "-o", "yaml", "--namespace", self.kube_axsys_namespace, "--kubeconfig", self._kube_config ] out = subprocess.check_output(cmd) return [yaml.load(out)["data"]["cluster-external-dns-name"]] except Exception: logger.error("Failed to get cluster dns name from config map.") return None @retry(wait_exponential_multiplier=1000, stop_max_attempt_number=5) def _get_svc_eip(self, svclabel, namespace): svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace, svclabel) assert len( svc.items) == 1, "Currently services should only have one ingress" rst = [] for ig in svc.items[0].status.load_balancer.ingress: if ig.hostname: rst.append(ig.hostname) if ig.ip: rst.append(ig.ip) return rst def _set_ext_dns(self): axops_eip = self._get_eip_from_config_map() or self._get_svc_eip( svclabel="app=axops", namespace=AXNameSpaces.AXSYS) if not axops_eip: logger.error( "Platform Start Failed: cannot find External IP for AXOPS") raise AXPlatformException("AXOPS elastic IP does not exist") self.cluster_dns_name = axops_eip[0] # Don't change format of this message. Portal parses this line to get cluster IP/DNS. logger.info( "\n\n%s>>>>> Starting Argo platform... 
cluster DNS: %s%s\n", COLOR_GREEN, self.cluster_dns_name, COLOR_NORM) self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name def get_cluster_external_dns(self): if not self.cluster_dns_name: self._set_ext_dns() return self.cluster_dns_name def _set_autoscaling(self): # Prepare autoscaler asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile) asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg( ) or asg_manager.get_on_demand_asg() if not asg: raise AXPlatformException( "Failed to get autoscaling group for cluster {}".format( self._cluster_name_id)) asg_name = asg["AutoScalingGroupName"] if asg_name is not None: self._replacing["ASG_NAME"] = asg_name else: logger.error("Autoscaling group name not found for %s", self._cluster_name_id) raise AXPlatformException("Cannot find cluster autoscaling group") # TODO (#157) Version should only be uploaded during install and upgrade time def _update_version(self): # Software info we get during install / upgrade does not contain ami id # need to persist it as well self._software_info.ami_id = self._cluster_config.get_ami_id() AXVersion(AXCustomerId().get_customer_id(), self._cluster_name_id, self._aws_profile).update(self._software_info.to_dict())
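# Illustrative sketch (not part of the original module): a typical bring-up driver for
# AXPlatform. The cluster name id below is a placeholder; a kube config is assumed to be
# reachable (otherwise __init__ above downloads it from S3).
#
#     platform = AXPlatform(cluster_name_id="demo-cluster-0123456789abcdef", debug=False)
#     try:
#         platform.start()
#         logger.info("Cluster DNS: %s", platform.get_cluster_external_dns())
#     finally:
#         platform.stop_monitor()
#
#     # ... and later, to shut the services down:
#     # platform.stop()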
class AXClusterBuckets(object):
    """
    Buckets created in the target account, i.e. the same account as the cluster.
    """

    def __init__(self, name_id, aws_profile, aws_region):
        self._name_id = name_id
        self._aws_profile = aws_profile
        self._aws_region = aws_region
        self.cluster_config = AXClusterConfig(cluster_name_id=self._name_id)

    def update(self):
        logger.info("Creating and updating all cluster buckets ...")
        self._update_cluster_bucket()
        self._update_data_bucket()
        logger.info("Creating and updating all cluster buckets ... DONE")

    def delete(self):
        logger.info("Deleting all cluster buckets ...")
        self._delete_cluster_bucket()
        self._delete_data_bucket()
        logger.info("Deleting all cluster buckets ... DONE")

    def _update_cluster_bucket(self):
        bucket_name = AXClusterConfigPath(name_id=self._name_id).bucket()
        cluster_bucket = Cloud().get_bucket(bucket_name,
                                            aws_profile=self._aws_profile,
                                            region=self._aws_region)
        if not cluster_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(cluster_bucket.get_bucket_name()))
        logger.info("Created %s bucket ... DONE", cluster_bucket.get_bucket_name())

    def _update_data_bucket(self):
        data_bucket = Cloud().get_bucket(AXClusterDataPath(name_id=self._name_id).bucket(),
                                         aws_profile=self._aws_profile,
                                         region=self._aws_region)
        if not data_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(data_bucket.get_bucket_name()))

        if self.cluster_config.get_cluster_provider() != ClusterProvider.USER:
            # Update CORS config for data bucket too.
            logger.info("Checking CORS config for %s.", data_bucket.get_bucket_name())
            data_bucket.put_cors(DATA_CORS_CONFIG)

        logger.info("Created %s bucket ... DONE", data_bucket.get_bucket_name())

    def _delete_cluster_bucket(self):
        logger.info("Deleting applatix-cluster bucket contents for cluster %s ...", self._name_id)
        cluster_bucket = Cloud().get_bucket(AXClusterConfigPath(name_id=self._name_id).bucket(),
                                            aws_profile=self._aws_profile,
                                            region=self._aws_region)

        idobj = AXClusterId(name=self._name_id)
        cluster_config_path = AXClusterConfigPath(name_id=self._name_id)
        cluster_name = idobj.get_cluster_name()
        prefix = cluster_name + "/"

        # TODO: Not idempotent here.
        # Consider the following case: if an exception is thrown while deleting S3 objects, install stage 1
        # information has already been deleted but not everything else was removed successfully. The next time
        # the user executes "delete", this program will assume install stage 1 has already been cleaned up.
        exempt = [idobj.get_cluster_id_s3_key(), cluster_config_path.cluster_install_stage0_key()]
        logger.info("Deleting objects for cluster %s from bucket %s. This may take a while.",
                    cluster_name, cluster_bucket.get_bucket_name())
        cluster_bucket.delete_all(obj_prefix=prefix, exempt=exempt)
        logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                    cluster_name, cluster_bucket.get_bucket_name())
        logger.info("Deleting stage0 information ...")
        for item in exempt:
            cluster_bucket.delete_object(item)
        logger.info("Deleting stage0 information ... DONE")

    def _delete_data_bucket(self):
        logger.info("Deleting applatix-data bucket contents for cluster %s ...", self._name_id)
        data_bucket = Cloud().get_bucket(AXClusterDataPath(name_id=self._name_id).bucket(),
                                         aws_profile=self._aws_profile,
                                         region=self._aws_region)
        cluster_name = AXClusterId(name=self._name_id).get_cluster_name()
        prefix = cluster_name + "/"
        logger.info("Deleting objects for cluster %s from bucket %s. This may take a while.",
                    cluster_name, data_bucket.get_bucket_name())
        data_bucket.delete_all(obj_prefix=prefix)
        logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                    cluster_name, data_bucket.get_bucket_name())
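# Illustrative sketch (not part of the original module): creating and removing the
# cluster/data buckets. The name id, profile and region below are placeholders.
#
#     buckets = AXClusterBuckets("demo-cluster-0123456789abcdef",
#                                aws_profile="default",
#                                aws_region="us-west-2")
#     buckets.update()   # idempotent create of the cluster and data buckets
#     buckets.delete()   # uninstall path; cluster bucket deletion is not fully idempotent (see TODO above)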
class AXSYSKubeYamlUpdater(object): """ This class loads a kubernetes yaml file, updates resource, and generate objects that kube_object.py can consume """ def __init__(self, config_file_path): assert os.path.isfile( config_file_path), "Config file {} is not a file".format( config_file_path) self._config_file = config_file_path self._cluster_name_id = AXClusterId().get_cluster_name_id() self._cluster_config = AXClusterConfig( cluster_name_id=self._cluster_name_id) if not self._cluster_config.get_cluster_provider().is_user_cluster(): self.cpu_mult, self.mem_mult, self.disk_mult, \ self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers() else: self.cpu_mult = 1 self.mem_mult = 1 self.disk_mult = 1 self.daemon_cpu_mult = 1 self.daemon_mem_mult = 1 self._swagger_components = [] self._yaml_components = [] self._updated_raw = "" # TODO: when we support config software info using a config file, need to figure out how that # file gets passed through, since SoftwareInfo is not a singleton self._software_info = SoftwareInfo() self._load_objects() self._load_raw() @property def updated_raw(self): return self._updated_raw @property def components_in_dict(self): return self._yaml_components @property def components_in_swagger(self): return self._swagger_components def _load_objects(self): with open(self._config_file, "r") as f: data = f.read() for c in yaml.load_all(data): swagger_obj = self._config_yaml(c) yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj) self._swagger_components.append(swagger_obj) self._yaml_components.append(yaml_obj) def _load_raw(self): self._updated_raw = yaml.dump_all(self._yaml_components) def _get_resource_multipliers(self): """ Resources in yaml templates need to be multiplied with these numbers :return: cpu_multiplier, mem_multiplier, disk_multiplier """ # Getting cluster size from cluster config, in order to configure resources # There are 3 situations we will be using AXClusterConfig # - During install, since the class is a singleton, it has all the values we need # no need to download from s3 # - During upgrade, since we are exporting AWS_DEFAULT_PROFILE, we can download # cluster config files from s3 to get the values # - During job creation: the node axmon runs has the proper roles to access s3 try: ax_node_max = int(self._cluster_config.get_asxys_node_count()) ax_node_type = self._cluster_config.get_axsys_node_type() usr_node_max = int( self._cluster_config.get_max_node_count()) - ax_node_max usr_node_type = self._cluster_config.get_axuser_node_type() assert all( [ax_node_max, ax_node_type, usr_node_max, usr_node_type]) except Exception as e: logger.error( "Unable to read cluster config, skip resource config for %s. 
Error %s", self._config_file, e) return 1, 1, 1, 1, 1 rc = AXSYSResourceConfig( ax_node_type=ax_node_type, ax_node_max=ax_node_max, usr_node_type=usr_node_type, usr_node_max=usr_node_max, cluster_type=self._cluster_config.get_ax_cluster_type()) #logger.info("With %s %s axsys nodes, %s %s axuser nodes, component %s uses multipliers (%s, %s, %s, %s, %s)", # ax_node_max, ax_node_type, usr_node_max, usr_node_type, self._config_file, # rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, # rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier) return rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier def _config_yaml(self, kube_yaml_obj): """ Load dict into swagger object, patch resource, sanitize, return a dict :param kube_yaml_obj: :return: swagger object with resource values finalized """ kube_kind = kube_yaml_obj["kind"] (swagger_class_literal, swagger_instance) = KubeKindToV1KubeSwaggerObject[kube_kind] swagger_obj = ApiClient()._ApiClient__deserialize( kube_yaml_obj, swagger_class_literal) assert isinstance(swagger_obj, swagger_instance), \ "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance) if isinstance(swagger_obj, V1beta1Deployment): if not self._software_info.registry_is_private(): swagger_obj.spec.template.spec.image_pull_secrets = None node_selector = swagger_obj.spec.template.spec.node_selector if node_selector and node_selector.get('ax.tier', 'applatix') == 'master': # Skip updating containers on master. logger.info( "Skip updating cpu, mem multipliers for pods on master: %s", swagger_obj.metadata.name) else: for container in swagger_obj.spec.template.spec.containers: self._update_container(container) return swagger_obj elif isinstance(swagger_obj, V1Pod): if not self._software_info.registry_is_private(): swagger_obj.spec.image_pull_secrets = None return swagger_obj elif isinstance(swagger_obj, V1beta1DaemonSet): if not self._software_info.registry_is_private(): swagger_obj.spec.template.spec.image_pull_secrets = None for container in swagger_obj.spec.template.spec.containers: # We are special-casing applet DaemonSet to compromise the fact that # we are using different node type for compute-intense nodes if swagger_obj.metadata.name == "applet": self._update_container(container=container, is_daemon=True, update_resource=True) else: self._update_container(container=container, is_daemon=True, update_resource=False) return swagger_obj elif isinstance(swagger_obj, V1beta1StatefulSet): if not self._software_info.registry_is_private(): swagger_obj.spec.template.spec.image_pull_secrets = None return self._update_statefulset(swagger_obj) elif isinstance(swagger_obj, V1PersistentVolumeClaim): self._update_volume(swagger_obj) return swagger_obj else: # logger.info("Object %s does not need to configure resource", type(swagger_obj)) # HACK, as the original hook will be messed up if isinstance(swagger_obj, V1Service): if swagger_obj.metadata.name == "axops": swagger_obj.spec.load_balancer_source_ranges = [] if self._cluster_config and self._cluster_config.get_trusted_cidr( ): for cidr in self._cluster_config.get_trusted_cidr(): # Seems swagger client does not support unicode ... 
SIGH swagger_obj.spec.load_balancer_source_ranges.append( str(cidr)) # HACK #2: if we don't do this, kubectl will complain about something such as # # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z) # # p.target_port is defined as string though, but if its really a string, kubectl # is looking for a port name, rather than a number # SIGH ... for p in swagger_obj.spec.ports or []: try: p.target_port = int(p.target_port) except (ValueError, TypeError): pass return swagger_obj def _update_deployment_or_daemonset(self, kube_obj): assert isinstance(kube_obj, V1beta1Deployment) or isinstance( kube_obj, V1beta1DaemonSet) for container in kube_obj.spec.template.spec.containers: self._update_container(container) return kube_obj def _update_statefulset(self, kube_obj): assert isinstance(kube_obj, V1beta1StatefulSet) for container in kube_obj.spec.template.spec.containers: self._update_container(container) if isinstance(kube_obj.spec.volume_claim_templates, list): for vol in kube_obj.spec.volume_claim_templates: self._update_volume(vol) return kube_obj def _update_container(self, container, is_daemon=False, update_resource=True): assert isinstance(container, V1Container) if update_resource: cpulim = container.resources.limits.get("cpu") memlim = container.resources.limits.get("memory") cpureq = container.resources.requests.get("cpu") memreq = container.resources.requests.get("memory") def _massage_cpu(orig): return orig * self.daemon_cpu_mult if is_daemon else orig * self.cpu_mult def _massage_mem(orig): return orig * self.daemon_mem_mult if is_daemon else orig * self.mem_mult if cpulim: rvc = ResourceValueConverter(value=cpulim, target="cpu") rvc.massage(_massage_cpu) container.resources.limits["cpu"] = "{}m".format( rvc.convert("m")) if cpureq: rvc = ResourceValueConverter(value=cpureq, target="cpu") rvc.massage(_massage_cpu) container.resources.requests["cpu"] = "{}m".format( rvc.convert("m")) if memlim: rvc = ResourceValueConverter(value=memlim, target="memory") rvc.massage(_massage_mem) container.resources.limits["memory"] = "{}Mi".format( int(rvc.convert("Mi"))) if memreq: rvc = ResourceValueConverter(value=memreq, target="memory") rvc.massage(_massage_mem) container.resources.requests["memory"] = "{}Mi".format( int(rvc.convert("Mi"))) if container.liveness_probe and container.liveness_probe.http_get: try: container.liveness_probe.http_get.port = int( container.liveness_probe.http_get.port) except (ValueError, TypeError): pass if container.readiness_probe and container.readiness_probe.http_get: try: container.readiness_probe.http_get.port = int( container.readiness_probe.http_get.port) except (ValueError, TypeError): pass # Add resource multiplier to containers in case we need them if not container.env: container.env = [] container.env += self._generate_default_envs(is_daemon, update_resource) def _update_volume(self, vol): assert isinstance(vol, V1PersistentVolumeClaim) vol_size = vol.spec.resources.requests["storage"] def _massage_disk(orig): return orig * self.disk_mult if vol_size: rvc = ResourceValueConverter(value=vol_size, target="storage") rvc.massage(_massage_disk) # Since AWS does not support value such as 1.5G, lets round up to its ceil vol.spec.resources.requests["storage"] = "{}Gi".format( int(ceil(rvc.convert("Gi")))) # Manually patch access mode as swagger client mistakenly interprets this as map vol.spec.access_modes = ["ReadWriteOnce"] def _generate_default_envs(self, is_daemon, resource_updated): """ Add essential variables to all system 
containers :param is_daemon: :return: """ default_envs = [ # Kubernetes downward APIs { "name": "AX_NODE_NAME", "path": "spec.nodeName" }, { "name": "AX_POD_NAME", "path": "metadata.name" }, { "name": "AX_POD_NAMESPACE", "path": "metadata.namespace" }, { "name": "AX_POD_IP", "path": "status.podIP" }, # Values { "name": "DISK_MULT", "value": str(self.disk_mult) }, { "name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud() }, { "name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id }, { "name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id() }, ] aws_region = os.environ.get("AX_AWS_REGION", "") if aws_region != "": default_envs.append({"name": "AX_AWS_REGION", "value": aws_region}) if os.getenv("ARGO_S3_ACCESS_KEY_ID", "") != "": # Secrets default_envs.append({ "name": "ARGO_S3_ACCESS_KEY_ID", "secret": "argo-access-key" }) default_envs.append({ "name": "ARGO_S3_ACCESS_KEY_SECRET", "secret": "argo-secret-key" }) default_envs.append({ "name": "ARGO_S3_ENDPOINT", "value": os.getenv("ARGO_S3_ENDPOINT", None) }) # Special cases for daemons if is_daemon: if resource_updated: default_envs += [ { "name": "CPU_MULT", "value": str(self.daemon_cpu_mult) }, { "name": "MEM_MULT", "value": str(self.daemon_mem_mult) }, ] else: default_envs += [ { "name": "CPU_MULT", "value": "1.0" }, { "name": "MEM_MULT", "value": "1.0" }, ] else: default_envs += [ { "name": "CPU_MULT", "value": str(self.cpu_mult) }, { "name": "MEM_MULT", "value": str(self.mem_mult) }, ] rst = [] for d in default_envs: var = V1EnvVar() var.name = d["name"] if d.get("path", None): field = V1ObjectFieldSelector() field.field_path = d["path"] src = V1EnvVarSource() src.field_ref = field var.value_from = src elif d.get("secret", None): secret = V1SecretKeySelector() secret.key = d["secret"] secret.name = d["secret"] src = V1EnvVarSource() src.secret_key_ref = secret var.value_from = src else: var.value = d["value"] rst.append(var) return rst
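# Illustrative sketch (not part of the original module): loading an axsys manifest and
# retrieving the resource-adjusted output. The manifest path below is a placeholder.
#
#     updater = AXSYSKubeYamlUpdater("/ax/config/service/axops-svc.yml.in")
#     patched_yaml = updater.updated_raw            # yaml text with cpu/mem/disk multipliers applied
#     swagger_objs = updater.components_in_swagger  # same content, as swagger objects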
class Deployment(object): """ This class creates and manages a single deployment object A deployment consists of the following specifications in kubernetes 1. A kubernetes deployment spec 2. Zero or more kubernetes service specs 3. Zero or more ingress rules All functions in the object need to be idempotent. """ def __init__(self, name, application): """ Each deployment has a name and needs to be part of an application Application maps to a kubernetes namespace and the deployment will be created in this namespace. Args: name: deployment name application: the application that this deployment runs under """ self.name = name self.application = application self.client = KubernetesApiClient(use_proxy=True) self._nameid = AXClusterId().get_cluster_name_id() self._software_info = SoftwareInfo() self._app_obj = Application(application) self._resources = AXResources() self.spec = None self._cluster_config = AXClusterConfig() def create(self, spec): """ Create a deployment from the template specified Idempotency: This function is idempotent. A create of identical spec will have no impact if the deployment already exists. If the spec is different then the existing deployment will be updated. """ @retry_unless(status_code=[404, 422]) def create_in_provider(k8s_spec): try: logger.info( "Creating deployment %s in Kubernetes namespace %s", self.name, self.application) self.client.apisappsv1beta1_api.create_namespaced_deployment( k8s_spec, self.application) logger.info( "Done creating deployment %s in Kubernetes namespace %s", self.name, self.application) except swagger_client.rest.ApiException as e: if e.status == 409: self.client.apisappsv1beta1_api.replace_namespaced_deployment( k8s_spec, self.application, self.name) else: raise e with DeploymentOperation(self): self.spec = spec # Do some template checks self._template_checks() # First create supplemental resources such as routes, ingress rules etc self._create_deployment_resources() # Now create the deployment spec d_spec = self._create_deployment_spec() # Store the resources in the deployment spec self._resources.finalize(d_spec) # Create the deployment object in kubernetes create_in_provider(d_spec) def delete(self, timeout=None): """ Delete the deployment. Idempotency: This function is idempotent. If deployment does not exist then delete will silently fail without raising any exceptions. 
Args: timeout: In seconds or None for infinite """ options = swagger_client.V1DeleteOptions() options.grace_period_seconds = 1 options.orphan_dependents = False def check_result(result): # True for retry False for done return not result @retry(retry_on_result=check_result, wait_fixed=2000, stop_max_delay=timeout) def wait_for_scale_to_zero(): logger.debug("Wait for scale of deployment to 0 for {} {}".format( self.application, self.name)) @retry_unless(swallow_code=[404]) def get_scale_from_provider(): return self.client.apisappsv1beta1_api.read_namespaced_scale_scale( self.application, self.name) scale = get_scale_from_provider() if scale is None: return True if scale.status.replicas == 0: return True return False @retry_unless(swallow_code=[404, 409]) def delete_in_provider(): logger.debug("Deleting deployment for {} {}".format( self.application, self.name)) self.client.apisappsv1beta1_api.delete_namespaced_deployment( options, self.application, self.name) def delete_rs_in_provider(): logger.debug("Deleting replica set for {} {}".format( self.application, self.name)) self.client.extensionsv1beta1.deletecollection_namespaced_replica_set( self.application, label_selector="deployment={}".format(self.name)) # now delete deployment object and replication set with DeploymentOperation(self): dep_obj = self._deployment_status() self._scale_to(0) wait_for_scale_to_zero() if dep_obj: resources = AXResources(existing=dep_obj) resources.delete_all() delete_in_provider() delete_rs_in_provider() def status(self): """ Get the status of the deployment. Returns: Returns the entire V1Deployment as a dict. If deployment is not found then this will raise an AXNotFoundException (404) """ # STEP 1: Get status of deployment stat = self._deployment_status() if stat is None: raise AXNotFoundException( "Deployment {} not found in application {}".format( self.name, self.application)) dep_field_map = { "name": "metadata.name", "generation": "metadata.annotations.ax_generation", "desired_replicas": "status.replicas", "available_replicas": "status.available_replicas", "unavailable_replicas": "status.unavailable_replicas" } ret = KubeObject.swagger_obj_extract(stat, dep_field_map, serializable=True) # STEP 2: Get the pods for the deployment and events associated podlist = self._deployment_pods().items dep_events = self._app_obj.events(name=self.name) event_field_map = { "message": "message", "reason": "reason", "source": "source.component", "host": "source.host", "firstTS": "first_timestamp", "lastTS": "last_timestamp", "count": "count", "container": "involved_object.field_path", "type": "type" } ret["events"] = [] for event in dep_events: ret["events"].append( KubeObject.swagger_obj_extract(event, event_field_map, serializable=True)) ret["pods"] = [] for pod in podlist or []: # fill pod status and containers pstatus = Pod.massage_pod_status(pod) # fill events for pod pstatus["events"] = [] events = self._app_obj.events(name=pod.metadata.name) for event in events: pstatus["events"].append( KubeObject.swagger_obj_extract(event, event_field_map, serializable=True)) # fill pod failure information for pod based on events pstatus["failure"] = Deployment._pod_failed_info(pstatus) ret["pods"].append(pstatus) # STEP 3: From the deployment spec get the resources created by deployment # TODO: Add this when services are created by deployment return ret def get_labels(self): """ Get a dict of labels used for this deployment """ state = self._deployment_status() if state is None: raise AXNotFoundException( "Did not find deployment 
{} in application {}".format( self.name, self.application)) return KubeObject.swagger_obj_extract( state, {"labels": "spec.selector.match_labels"})['labels'] @staticmethod def _pod_failed_info(pod_status): if pod_status["phase"] != "Pending": return None for ev in pod_status["events"] or []: if ev["reason"] == "Failed" and ev["source"] == "kubelet" and ev["type"] == "Warning" and \ "Failed to pull image" in ev["message"] and ev["count"] > 5: return {"reason": "ImagePullFailure", "message": ev["message"]} return None def scale(self, replicas): with DeploymentOperation(self): # Deployments with volumes can't be scaled to > 1. if replicas > 1: dep_obj = self._deployment_status() if dep_obj: resources = AXResources(existing=dep_obj) for type in resources.get_all_types(): if type.startswith("ax.platform.volumes"): raise AXApiForbiddenReq( "Deployments with volumes can't be scaled to > 1 ({})" .format(replicas)) self._scale_to(replicas) @retry_unless(swallow_code=[404]) def _deployment_status(self): return self.client.apisappsv1beta1_api.read_namespaced_deployment( self.application, self.name) @retry_unless(swallow_code=[404]) def _deployment_pods(self): return self.client.api.list_namespaced_pod( self.application, label_selector="deployment={}".format(self.name)) def _create_deployment_spec(self): pod_spec = PodSpec(self.name, namespace=self.application) main_container = self.spec.template.get_main_container() main_container_spec = self._create_main_container_spec(main_container) pod_spec.add_main_container(main_container_spec) container_vols = self._get_main_container_vols() main_container_spec.add_volumes(container_vols) hw_res = main_container.get_resources() main_container_spec.add_resource_constraints("cpu_cores", hw_res.cpu_cores, limit=None) main_container_spec.add_resource_constraints("mem_mib", hw_res.mem_mib, limit=None) artifacts_container = pod_spec.enable_artifacts( self._software_info.image_namespace, self._software_info.image_version, None, main_container.to_dict()) secret_resources = artifacts_container.add_configs_as_vols( main_container.get_all_configs(), self.name, self.application) self._resources.insert_all(secret_resources) # Set up special circumstances based on annotations # Check if we need to circumvent the executor script. 
This is needed for containers that run # special init processes such as systemd as these processes like to be pid 1 if main_container.executor_spec: main_container_spec.command = None if main_container.docker_spec is not None: raise ValueError( "We do not support ax_ea_docker_enable with ax_ea_executor" ) # Does this container need to be privileged main_container_spec.privileged = main_container.privileged # Check if docker daemon sidecar needs to be added if main_container.docker_spec: # graph storage size is specified in GiB dind_container_spec = pod_spec.enable_docker( main_container.docker_spec.graph_storage_size_mib) dind_container_spec.add_volumes(pod_spec.get_artifact_vols()) dind_container_spec.add_resource_constraints( "cpu_cores", main_container.docker_spec.cpu_cores, limit=None) dind_container_spec.add_resource_constraints( "mem_mib", main_container.docker_spec.mem_mib, limit=None) dind_container_spec.add_volumes(container_vols) # Do we only need docker graph storage volume for the main container if main_container.graph_storage: dgs_vol = ContainerVolume("graph-storage-vol-only", main_container.graph_storage.mount_path) dgs_vol.set_type( "DOCKERGRAPHSTORAGE", main_container.graph_storage.graph_storage_size_mib) main_container_spec.add_volume(dgs_vol) # set the pod hostname to value provided in main container spec pod_spec.hostname = main_container.hostname # TODO: This needs fixup. job name is used in init container to ask permission to start # TODO: Don't know if this is needed in deployment or not? artifacts_container.add_env("AX_JOB_NAME", value=self.application) artifacts_container.add_env("AX_DEPLOYMENT_NEW", value="True") if len(container_vols) > 0: tmp_container_vols = copy.deepcopy(container_vols) volume_paths = [] for v in tmp_container_vols: v.set_mount_path("/ax/fix" + v.volmount.mount_path) volume_paths.append(v.volmount.mount_path) artifacts_container.add_volumes(tmp_container_vols) logger.info("Volumes to chmod: %s", volume_paths) artifacts_container.add_env("AX_VOL_MOUNT_PATHS", value=str(volume_paths)) # add annotation for service env which will show up in artifacts container pod_spec.add_annotation("AX_SERVICE_ENV", self._generate_service_env(self.spec.template)) pod_spec.add_annotation("AX_IDENTIFIERS", self._get_identifiers()) if self.spec.costid: pod_spec.add_annotation("ax_costid", json.dumps(self.spec.costid)) pod_spec.add_label("deployment", self.name) pod_spec.add_label("application", self.application) pod_spec.add_label("tier", "user") pod_spec.add_label("deployment_id", self.spec.id) # now that pod is ready get its spec and wrap it in a deployment k8s_spec = self._generate_deployment_spec_for_pod(pod_spec.get_spec()) logger.info("Generated Kubernetes spec for deployment %s", self.name) return k8s_spec def _create_main_container_spec(self, container_template): """ :type container_template: argo.template.v1.container.ContainerTemplate :rtype Container """ logger.debug("Container template is {}".format(container_template)) name = string_to_dns_label(container_template.name) container_spec = Container( name, container_template.image, pull_policy=container_template.image_pull_policy) container_spec.parse_probe_spec(container_template) # Necessary envs for handshake container_spec.add_env("AX_HANDSHAKE_VERSION", value=CUR_RECORD_VERSION) # Envs introduced to user container_spec.add_env("AX_POD_NAME", value_from="metadata.name") container_spec.add_env("AX_POD_IP", value_from="status.podIP") container_spec.add_env("AX_POD_NAMESPACE", 
value_from="metadata.namespace") container_spec.add_env("AX_NODE_NAME", value_from="spec.nodeName") container_spec.add_env("AX_CLUSTER_META_URL_V1", value=CLUSTER_META_URL_V1) # envs from user spec for env in container_template.env: (cfg_ns, cfg_name, cfg_key) = env.get_config() if cfg_ns is not None: secret = SecretResource(cfg_ns, cfg_name, self.name, self.application) secret.create() self._resources.insert(secret) container_spec.add_env( env.name, value_from_secret=(secret.get_resource_name(), cfg_key)) else: container_spec.add_env(env.name, value=env.value) # Unix socket for applet applet_sock = ContainerVolume("applet", "/tmp/applatix.io/") applet_sock.set_type("HOSTPATH", "/var/run/") container_spec.add_volume(applet_sock) return container_spec @staticmethod def _get_valid_name_from_axrn(axrn): # AXRN's will have non-alphanumeric characters such as : / @, etc which K8S doesn't # like in its PVC name. Replace all non-alphanumeric characters with -. name_regex = re.compile(r"\W+") return name_regex.sub("-", axrn).replace("_", "-") def _get_main_container_vols(self): container_template = self.spec.template.get_main_container() ret = [] for vol_name, vol in iteritems(container_template.inputs.volumes): # sanitize the volume name for kubernetes vol_name = string_to_dns_label(vol_name) cvol = ContainerVolume(vol_name, vol.mount_path) assert "resource_id" in vol.details, "Volume resource-id absent in volume details" assert "filesystem" in vol.details, "Volume filesystem absent in volume details" cvol.set_type("AWS_EBS", vol_name, vol.details["resource_id"], vol.details["filesystem"]) logger.debug("Volume {} {} mounted at {}".format( vol_name, vol.details, vol.mount_path)) ret.append(cvol) return ret def _generate_service_env(self, template): return base64.b64encode(json.dumps(template.to_dict())) def _get_identifiers(self): return { "application_id": self.spec.app_generation, "deployment_id": self.spec.id, "static": { "application_id": self.spec.app_id, "deployment_id": self.spec.deployment_id } } def _generate_deployment_spec_for_pod(self, pod_spec): metadata = swagger_client.V1ObjectMeta() metadata.name = self.name dspec = swagger_client.V1beta1DeploymentSpec() dspec.strategy = self._get_strategy() if self.spec.template.min_ready_seconds: dspec.min_ready_seconds = self.spec.template.min_ready_seconds dspec.selector = swagger_client.V1LabelSelector() dspec.selector.match_labels = {"deployment": self.name} dspec.replicas = self.spec.template.scale.min dspec.template = pod_spec deployment_obj = swagger_client.V1beta1Deployment() deployment_obj.metadata = metadata deployment_obj.spec = dspec return deployment_obj def _create_deployment_resources(self): for route in self.spec.template.internal_routes: # ignore empty port spec if len(route.ports) == 0: logger.debug( "Skipping internal route {} as port spec is empty".format( route.name)) continue ir = InternalRoute(route.name, self.application) ir.create(route.to_dict()["ports"], selector={"deployment": self.name}, owner=self.name) self._resources.insert(ir) logger.debug("Created route {}".format(ir)) for route in self.spec.template.external_routes: dns_name = route.dns_name() if dns_name.endswith("."): dns_name = dns_name[:-1] r = ExternalRoute(dns_name, self.application, {"deployment": self.name}, route.target_port, route.ip_white_list, route.visibility) elb_addr = None elb_name = None if not self._cluster_config.get_cluster_provider().is_user_cluster( ): try: elb_addr = visibility_to_elb_addr(route.visibility) elb_name = 

    def _create_deployment_resources(self):
        for route in self.spec.template.internal_routes:
            # Ignore empty port spec
            if len(route.ports) == 0:
                logger.debug("Skipping internal route {} as port spec is empty".format(route.name))
                continue
            ir = InternalRoute(route.name, self.application)
            ir.create(route.to_dict()["ports"], selector={"deployment": self.name}, owner=self.name)
            self._resources.insert(ir)
            logger.debug("Created route {}".format(ir))

        for route in self.spec.template.external_routes:
            dns_name = route.dns_name()
            if dns_name.endswith("."):
                dns_name = dns_name[:-1]
            r = ExternalRoute(dns_name, self.application, {"deployment": self.name},
                              route.target_port, route.ip_white_list, route.visibility)
            elb_addr = None
            elb_name = None
            if not self._cluster_config.get_cluster_provider().is_user_cluster():
                try:
                    elb_addr = visibility_to_elb_addr(route.visibility)
                    elb_name = visibility_to_elb_name(route.visibility)
                except AXNotFoundException:
                    if route.visibility == ExternalRouteVisibility.VISIBILITY_WORLD:
                        raise AXNotFoundException(
                            "Could not find the public ELB. "
                            "Please report this error to Applatix Support at [email protected]")
                    else:
                        assert route.visibility == ExternalRouteVisibility.VISIBILITY_ORGANIZATION, \
                            "Only world and organization are currently supported as visibility attributes"
                        raise AXNotFoundException(
                            "Please create a private ELB using the template named "
                            "'ax_private_elb_creator_workflow' before using 'visibility=organization'")
            name = r.create(elb_addr, elb_name=elb_name)
            self._resources.insert(r)
            logger.debug("Created external route {} for {}/{}/{}".format(
                name, self.application, self.name, dns_name))

        main_container = self.spec.template.get_main_container()
        for key_name, vol in iteritems(main_container.inputs.volumes):
            assert "resource_id" in vol.details, "Volume resource_id absent in volume details"
            name = vol.details.get("axrn", None)
            resource_id = vol.details.get("resource_id", None)
            assert name is not None and resource_id is not None, \
                "axrn and resource_id are required details for volume {}".format(key_name)
            nv_res = AXNamedVolumeResource(name, resource_id)
            nv_res.create()
            self._resources.insert(nv_res)
            logger.debug("Using named volume resource {} in application {}".format(name, self.application))

    @retry_unless(status_code=[422], swallow_code=[400, 404])
    def _scale_to(self, replicas):
        logger.debug("Scaling deployment to {} for {} {}".format(replicas, self.application, self.name))
        scale = swagger_client.V1beta1Scale()
        scale.spec = swagger_client.V1beta1ScaleSpec()
        scale.spec.replicas = replicas
        scale.metadata = swagger_client.V1ObjectMeta()
        scale.metadata.name = self.name
        scale.metadata.namespace = self.application
        self.client.apisappsv1beta1_api.replace_namespaced_scale_scale(scale, self.application, self.name)

    def _template_checks(self):
        if self.spec.template.scale and self.spec.template.scale.min > 1 and \
                len(self.spec.template.volumes) >= 1:
            raise ValueError(
                "Deployments with volumes can't have scale > 1 ({})".format(self.spec.template.scale.min))

    def _get_strategy(self):
        s = swagger_client.V1beta1DeploymentStrategy()
        s.type = "RollingUpdate" if self.spec.template.strategy.type == "rolling_update" else "Recreate"
        if s.type == "RollingUpdate":
            rolling_update = swagger_client.V1beta1RollingUpdateDeployment()
            rolling_update.max_unavailable = self.spec.template.strategy.rolling_update.max_unavailable
            rolling_update.max_surge = self.spec.template.strategy.rolling_update.max_surge
            s.rolling_update = rolling_update
        return s
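
    # Hypothetical usage sketch (the class constructor is not shown in this section, so the
    # class name "Deployment" and its arguments below are assumptions made for illustration
    # only; the method calls themselves are the ones defined above):
    #
    #   d = Deployment(name="web", application="my-app")
    #   d._template_checks()                    # rejects scale.min > 1 when volumes are used
    #   k8s_spec = d._create_deployment_spec()  # V1beta1Deployment wrapping the pod template
    #   d._create_deployment_resources()        # internal/external routes and named volumes
    #   d.scale(2)                              # raises AXApiForbiddenReq if volumes are attached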