Example #1
    def __init__(self, name, client=None):
        self.name = name
        if client is None:
            self._client = KubernetesApiClient(use_proxy=True)
        else:
            self._client = client

        self._registry_spec = None
        self._software_info = SoftwareInfo()
        if self._software_info.registry_is_private():
            secret = KubeObjectConfigFile(DEFAULT_SECRET_YAML_PATH, {"REGISTRY_SECRETS": self._software_info.registry_secrets})
            for obj in secret.get_swagger_objects():
                if isinstance(obj, swagger_client.V1Secret):
                    self._registry_spec = obj
            assert self._registry_spec, "Argo registry specification is missing"

        self._am_service_spec = None
        self._am_deployment_spec = None

        # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
        elb = InternalRoute("axops", "axsys", client=self._client)
        elb_status = elb.status(with_loadbalancer_info=True)["loadbalancer"][0]
        if not elb_status:
            raise AXPlatformException("Could not get axops elb address {}".format(elb_status))

        replacements = {"NAMESPACE": self._software_info.image_namespace,
                        "VERSION": self._software_info.image_version,
                        "REGISTRY": self._software_info.registry,
                        "APPLICATION_NAME": self.name,
                        "AXOPS_EXT_DNS": elb_status}
        cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
        assert cluster_name_id, "Cluster name id is None!"
        cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
        if not cluster_config.get_cluster_provider().is_user_cluster():
            axam_path = DEFAULT_AM_YAML_PATH
        else:
            axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
            replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv("ARGO_DATA_BUCKET_NAME")

        logger.info("Using replacements: %s", replacements)

        k = KubeObjectConfigFile(axam_path, replacements)
        for obj in k.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Service):
                self._am_service_spec = obj
            elif isinstance(obj, swagger_client.V1beta1Deployment):
                self._am_deployment_spec = obj
                self._add_pod_metadata("deployment", self._am_deployment_spec.metadata.name, is_label=True)
                self._add_pod_metadata("ax_costid", json.dumps({
                    "app": self.name,
                    "service": "axam-deployment",
                    "user": "******"
                }))
            else:
                logger.debug("Ignoring specification of type {}".format(type(obj)))
        assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"
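
A minimal usage sketch for this constructor (hedged: the enclosing class is elided from this listing, but Example #3 invokes it as Application(name, client=...)):

# Inject a Kubernetes client for testing, or omit it to let the constructor
# build a proxied KubernetesApiClient itself.
app = Application("my-app", client=KubernetesApiClient(use_proxy=True))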
Example #2
    def get_spec(self):

        # generate the metadata
        metadata = swagger_client.V1ObjectMeta()
        metadata.name = self.name
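        # Init containers are attached through the beta annotation as a JSON
        # string; this was the mechanism Kubernetes used before init containers
        # moved into the pod spec proper in 1.6.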
        metadata.annotations = {
            "pod.beta.kubernetes.io/init-containers":
            self._init_containers_spec()
        }
        for a in self.annotations:
            metadata.annotations[a] = self.annotations[a]

        metadata.labels = {}
        for l in self.labels:
            metadata.labels[l] = self.labels[l]

        # generate the pod specification
        pspec = swagger_client.V1PodSpec()
        if self.hostname:
            pspec.hostname = self.hostname
        pspec.containers = []

        if "wait" in self.cmap:
            pspec.containers.append(self.cmap["wait"].generate_spec())

        assert "main" in self.cmap, "Pod specification cannot be generated without a main container"
        pspec.containers.append(self.cmap["main"].generate_spec())

        if "dind" in self.cmap:
            pspec.containers.append(self.cmap["dind"].generate_spec())

        pspec.image_pull_secrets = self._build_image_pull_secrets()
        pspec.volumes = self._volume_spec()

        if self.restart_policy is not None:
            pspec.restart_policy = self.restart_policy

        cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
        assert cluster_name_id, "Cluster name id is None!"
        cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
        if not cluster_config.get_cluster_provider().is_user_cluster():
            pspec.node_selector = {"ax.tier": self._tier}

        # finalize the pod template spec
        spec = swagger_client.V1PodTemplateSpec()
        spec.metadata = metadata
        spec.spec = pspec

        return spec
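
A hedged sketch of how the returned V1PodTemplateSpec might be consumed (V1beta1DeploymentSpec is assumed to exist in this swagger client generation, and `pod` stands for an instance of the elided class):

deployment = swagger_client.V1beta1Deployment()
deployment.metadata = swagger_client.V1ObjectMeta()
deployment.metadata.name = "example-deployment"  # hypothetical name
deployment.spec = swagger_client.V1beta1DeploymentSpec()
deployment.spec.replicas = 1
deployment.spec.template = pod.get_spec()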
Example #3
class AXPlatform(object):
    def __new__(cls, *args, **kwargs):
        if Cloud().target_cloud_gcp():
            from .gke_platform import AXGKEPlatform
            return super(AXPlatform, cls).__new__(AXGKEPlatform)
        else:
            return super(AXPlatform, cls).__new__(cls)

    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        """
        self._software_info = software_info if software_info else SoftwareInfo()
        assert isinstance(self._software_info, SoftwareInfo), \
            "Wrong type ({}) of software info passed in.".format(self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        self._cluster_name_id = AXClusterId(cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(
                aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we fail to create an object, we don't delete it;
        # we just leave it behind for debugging.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        self._kube_config = (self._cluster_info.get_kube_config_file_path()
                             if self._cluster_name_id else None)
        if self._cluster_name_id:
            if not os.path.isfile(self._kube_config):
                logger.info("Can't find config file at %s; downloading from s3",
                            self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(self._kube_config), "No kube_config file available"

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        self._monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        self._monitor.start()

        # Kube Objects
        self._kube_objects = {}
        self._replacing = {}

    def _load_kube_objects_from_steps(self, steps):
        """
        Extract kube objects from steps in config, and load them into memory
        :param steps: list
        :return:
        """
        for object_group in steps:
            assert isinstance(object_group, AXPlatformObjectGroup)
            for obj in object_group.object_set:
                assert isinstance(obj, AXPlatformObject)
                name = obj.name
                filename = obj.manifest
                namespace = obj.namespace
                if name in self._kube_objects:
                    raise ValueError("Duplicated object name {}".format(name))
                kubeobj_conf_path = os.path.join(self._manifest_root, filename)
                self._kube_objects[name] = KubeObject(
                    config_file=kubeobj_conf_path,
                    kubepoll=self.kube_poll,
                    replacing=None,
                    kube_config=self._kube_config,
                    kube_namespace=namespace)

    def _get_trusted_cidr_str(self):
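        # Build the list string by hand so Python 2 unicode reprs (u'...') cannot
        # leak into the generated YAML. For example (illustrative values):
        #   ["10.0.0.0/8", "1.2.3.4/32"] -> ["10.0.0.0/8","1.2.3.4/32"]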
        trusted_cidr = self._cluster_config.get_trusted_cidr()
        if isinstance(trusted_cidr, list):
            trusted_cidr_str = "["
            for cidr in trusted_cidr:
                trusted_cidr_str += "\"{}\",".format(str(cidr))
            trusted_cidr_str = trusted_cidr_str[:-1]
            trusted_cidr_str += "]"
        else:
            trusted_cidr_str = "[{}]".format(trusted_cidr)
        return trusted_cidr_str

    def _generate_replacing_for_user_provisioned_cluster(self):
        trusted_cidr_str = self._get_trusted_cidr_str()
        self._persist_node_resource_rsvp(0, 0)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        return {
            "REGISTRY": self._software_info.registry,
            "REGISTRY_SECRETS": self._software_info.registry_secrets,
            "NAMESPACE": self._software_info.image_namespace,
            "VERSION": self._software_info.image_version,
            "AX_CLUSTER_NAME_ID": self._cluster_name_id,
            "AX_AWS_REGION": self._region,
            "AX_AWS_ACCOUNT": self._account,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR": trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
            "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
            "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP",
                                       default_kube_up_env["DNS_SERVER_IP"]),
            "ARGO_DATA_BUCKET_NAME": AXClusterConfigPath(self._cluster_name_id).bucket(),
            "LOAD_BALANCER_TYPE": "LoadBalancer",
            "ARGO_S3_ACCESS_KEY_ID": base64.b64encode(
                os.getenv("ARGO_S3_ACCESS_KEY_ID", "")),
            "ARGO_S3_ACCESS_KEY_SECRET": base64.b64encode(
                os.getenv("ARGO_S3_ACCESS_KEY_SECRET", "")),
        }

    def _generate_replacing(self):
        # Platform code runs on Python 2.7, where str() on the trusted CIDR list
        # returns something like [u'54.149.149.230/32', u'73.70.250.25/32',
        # u'104.10.248.90/32'], and the u'' prefix cannot be suppressed. With this
        # prefix, our macro replacement would create invalid yaml files, so we
        # construct the string manually here
        trusted_cidr_str = self._get_trusted_cidr_str()
        axsys_cpu = 0
        axsys_mem = 0
        daemon_cpu = 0
        daemon_mem = 0
        for name in self._kube_objects.keys():
            cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage
            axsys_cpu += cpu
            axsys_mem += mem
            daemon_cpu += dcpu
            daemon_mem += dmem

        # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
        # have a memory request, but this is an approximation)
        daemon_cpu += 100
        daemon_mem += 100

        logger.info(
            "Resource Usages: axsys_cpu: %s millicores, axsys_mem: %s Mi, node_daemon_cpu: %s millicores, node_daemon_mem: %s Mi",
            axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = str(
            int(self._cluster_config.get_min_node_count()) - axsys_node_count)
        axuser_max_count = str(
            int(self._cluster_config.get_max_node_count()) - axsys_node_count)
        autoscaler_scan_interval = str(
            self._cluster_config.get_autoscaler_scan_interval())

        usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["cpu"]
        usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["memory"]
        scale_down_util_thresh = round(
            max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
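        # Illustrative arithmetic (numbers not from the source): with daemon_cpu =
        # 350 millicores on a node type providing 2000 millicores, usr_node_cpu_rsvp
        # = 0.175; assuming the memory ratio is smaller, the threshold becomes 0.176.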
        logger.info("Setting node scale down utilization threshold to %s",
                    scale_down_util_thresh)

        self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = (asg_manager.get_variable_asg() or asg_manager.get_spot_asg()
               or asg_manager.get_on_demand_asg())
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if not asg_name:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        # Prepare minion-manager.
        spot_instances_option = self._cluster_config.get_spot_instances_option()
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            for asg in asg_manager.get_all_asgs():
                minion_manager_asgs += asg["AutoScalingGroupName"] + " "
            minion_manager_asgs = minion_manager_asgs.strip()
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            minion_manager_asgs = asg_manager.get_variable_asg()["AutoScalingGroupName"]

        return {
            "REGISTRY": self._software_info.registry,
            "REGISTRY_SECRETS": self._software_info.registry_secrets,
            "NAMESPACE": self._software_info.image_namespace,
            "VERSION": self._software_info.image_version,
            "AX_CLUSTER_NAME_ID": self._cluster_name_id,
            "AX_AWS_REGION": self._region,
            "AX_AWS_ACCOUNT": self._account,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR": trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
            "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
            "ASG_MIN": axuser_min_count,
            "ASG_MAX": axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME": asg_name,
            "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP",
                                       default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES": str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs,
        }

    def _persist_node_resource_rsvp(self, user_node_daemon_cpu,
                                    user_node_daemon_mem):
        self._cluster_config.set_user_node_resource_rsvp(
            cpu=user_node_daemon_cpu, mem=user_node_daemon_mem)
        self._cluster_config.save_config()

    def start(self):
        """
        Bring up platform using "platform-start.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects
        steps = self._config.steps
        self._load_kube_objects_from_steps(steps)

        if self._cluster_config.get_cluster_provider() != ClusterProvider.USER:
            self._replacing = self._generate_replacing()
        else:
            self._replacing = self._generate_replacing_for_user_provisioned_cluster()

        logger.debug("Replacing ENVs: %s", self._replacing)

        # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32)
        # At this moment, we MUST run the first steps separately due to the above dependency
        assert len(steps) >= 3, "Should have at least 3 steps to create axops"
        self.create_objects(steps[0])
        self.create_objects(steps[1])
        self.create_objects(steps[2])

        # Prepare axops_eip
        if self._cluster_config.get_provider() != "minikube":
            self._set_ext_dns()

        info_bound = "=======================================================\n"
        img_namespace = "Image Namespace: {}\n".format(
            self._software_info.image_namespace)
        img_version = "Image Version: {}\n".format(
            self._software_info.image_version)
        start_info = "\n\n{}{}{}{}{}".format(
            info_bound, "Platform Up: Bringing up Argo services...\n",
            img_namespace, img_version, info_bound)
        logger.info(start_info)

        # Start rest of the objects
        for i in range(3, len(steps)):
            self.create_objects(steps[i])

        # update application namespace
        logger.info("Updating application managers")
        for app in Applications(client=self.kubectl).list():
            logger.info("--- updating {}".format(app))
            a = Application(app, client=self.kubectl)
            a.create(force_recreate=True)
        logger.info("Done updating application managers")

        # Upload version information to target cluster
        self._update_version()
        logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n",
                    COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name,
                    COLOR_NORM)

    def stop(self):
        """
        Bring down platform using "platform-stop.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects (Does not need to generate replacing during platform down)
        # Stop order should be the reverse of start
        steps = self._config.steps
        steps.reverse()
        self._load_kube_objects_from_steps(steps)

        info_bound = "=======================================================\n"
        stop_info = "\n\n{}{}{}".format(
            info_bound, "Platform Down: Shutting down Argo services...\n",
            info_bound)
        logger.info(stop_info)

        # Bring down objects according to steps
        for i in range(len(steps)):
            object_group = steps[i]
            self.delete_objects(object_group)

    def stop_monitor(self):
        self._monitor.stop()

    def create_objects(self, objects):
        """
        Start kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        if objects is None or len(objects.object_set) == 0:
            return

        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate,
                consistency=objects.consistency):
            logger.debug(
                "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                objects.name, objects.policy, objects.policy_predicate,
                objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        logger.info("Creating platform objects\n\n%s",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.start_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)

        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_create_group(self, policy, policy_predicate, consistency):
        """
        Take AXPlatformObjectGroup policy, predicate and consistency and determine
        if this group should be created or not
        :param policy:
        :param policy_predicate:
        :param consistency:
        :return:
        """
        # Since we are not using consistency, we always create unless explicitly
        # told not to, i.e. when the PrivateRegistryOnly predicate applies.
        # We keep the interface here so that create-or-not can be decided by
        # policy, policy_predicate and consistency.

        if policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly and \
                not self._software_info.registry_is_private():
            return False
        return True

    def delete_objects(self, objects):
        """
        Stop kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_delete_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate):
            logger.debug(
                "Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                objects.name, objects.policy, objects.policy_predicate)
            return
        logger.info("Delete step: %s", objects.name)
        logger.info("Deleting platform objects\n\n%s.",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.stop_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Delete")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_delete_group(self, policy, policy_predicate):
        """
        Take AXPlatformObjectGroup policy and determine if this group should be deleted or not.
        Consistency is not needed
        for deletion

        :param policy:
        :param policy_predicate:
        :return:
        """
        if policy == ObjectGroupPolicy.CreateMany:
            return True
        return False

    def start_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Creating %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]

        # Refresh these, since replacing may have been updated during platform start
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing

        assert isinstance(kube_obj, KubeObject)
        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }
        if kube_obj.healthy():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        # A previous platform start might have failed, leaving some components
        # created but not healthy (e.g. in CrashLoopBackoff). In this case, we delete
        # the existing object and try to create a new one
        if kube_obj.exists():
            logger.warning(
                "Object %s exists but not healthy. Deleting object for idempotency ...",
                name)
            self.stop_one(name, namespace)

        assert not kube_obj.exists(), \
            "Kubeobject {} already created but is not healthy. Not expected".format(name)

        monitor_info = kube_obj.get_create_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout,
                    waiter)

            # Call kubectl create
            kube_obj.create()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], waiter.result))
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    logger.info("Successfully created %s with code %s.",
                                wait_info["name"], waiter.result)
                else:
                    result["failed"] = True
                    logger.error(
                        "Failed to create %s in %s with code %s. Events: %s",
                        wait_info["name"], namespace, waiter.result,
                        str(waiter.details))
                    if not self._debug:
                        logger.info("Deleting %s due to creation failure",
                                    name)
                        del_rst = self.stop_one(name, namespace)
                        result["code"] += del_rst["code"]
                        result["events"] += del_rst["events"]
                        result["duration"] = str(round(time.time() - start, 2))
                        return result

            # Poll extra if required (for PetSets and Deployments with multiple replicas)
            if kube_obj.extra_poll:
                logger.info(
                    "Polling till healthy to make sure rest of components of %s are up and running ...",
                    name)
                create_rst = self._poll_till_healthy(
                    name=name,
                    kube_obj=kube_obj,
                    start_time=start,
                    poll_interval=AXPlatformConfigDefaults.
                    ObjCreateExtraPollInterval,
                    poll_max_retry=AXPlatformConfigDefaults.
                    ObjCreateExtraPollMaxRetry,
                    rst=result)
                if create_rst["failed"] and not self._debug:
                    logger.info("Deleting %s due to creation failure", name)
                    del_rst = self.stop_one(name, namespace)
                    create_rst["code"] += del_rst["code"]
                    create_rst["events"] += del_rst["events"]
                    create_rst["duration"] = str(round(time.time() - start, 2))
                return create_rst

            # Poll once to confirm all components from this Kubernetes config file
            # exist, in case the config file contains objects that cannot be
            # monitored, e.g. a svc without an ELB. This is really not expected,
            # so we don't delete it
            if not kube_obj.healthy():
                logger.error(
                    "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                    name)
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                result["failed"] = True
                result["events"].append(
                    "Object {} created byt is not healthy".format(name))
            result["duration"] = str(round(time.time() - start, 2))

            if not result["failed"]:
                logger.info("Successfully created object %s.", name)
            return result
        else:
            # use polling
            kube_obj.create()
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                logger.info("Deleting %s due to creation failure", name)
                del_rst = self.stop_one(name, namespace)
                create_rst["code"] += del_rst["code"]
                create_rst["events"] += del_rst["events"]
                create_rst["duration"] = str(round(time.time() - start, 2))
            return create_rst

    @staticmethod
    def _poll_till_healthy(name, kube_obj, start_time, poll_interval,
                           poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if not kube_obj.healthy():
                trail += 1
                if trail > poll_max_retry:

                    logger.error("Failed to create KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY)
                    ]
                    rst["events"] += [
                        "Object {} creation timeout. Not healthy".format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully created %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.OK)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Deleting %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing
        assert isinstance(kube_obj, KubeObject)

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }

        # Don't delete if object does not exist
        if not kube_obj.exists():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        monitor_info = kube_obj.get_delete_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout,
                    waiter)

            # Call kubectl delete
            kube_obj.delete()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.DELETED))
                    logger.info("Successfully deleted %s in %s with code %s.",
                                wait_info["name"], name, result["code"])
                else:
                    result["failed"] = True
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.UNKNOWN))
                    logger.error(
                        "Failed to delete %s in %s with code %s. Events: %s",
                        wait_info["name"], name, result["code"],
                        str(waiter.details))

            # Poll once to confirm all components from this Kubernetes config file
            # are gone, in case the config file contains objects that cannot be
            # monitored, e.g. a svc without an ELB
            if kube_obj.exists():
                logger.error("Object %s deleted but still exists", name)
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                result["events"].append(
                    "Object {} deleted but still exists.".format(name))
            result["duration"] = str(round(time.time() - start, 2))
            if not result["failed"]:
                logger.info("Successfully deleted %s.", name)
            return result
        else:
            # use polling
            kube_obj.delete()
            return self._poll_till_not_exists(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
                rst=result)

    @staticmethod
    def _poll_till_not_exists(name, kube_obj, start_time, poll_interval,
                              poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if kube_obj.exists():
                trail += 1
                if trail > poll_max_retry:
                    logger.error("Failed to delete KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNKNOWN)
                    ]
                    rst["events"] += [
                        "Object {} deletion timeout. Please manually check remaining pods"
                        .format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully deleted %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def _generate_object_summary(self, objects):
        """
        :param objects: list of AXPlatformObject
        :return:
        """
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "MANIFEST", "NAMESPACE")
        report_bar = "{}\n".format("-" * 174)
        content = ""
        for obj in objects:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            filename = os.path.join(self._manifest_root, obj.manifest)
            namespace = obj.namespace
            content += "{:25s} |  {:110s} |  {:20s}\n".format(
                name, filename, namespace)

        return report_title + report_bar + content

    @staticmethod
    def _generate_report(results, operation):
        failed = False
        report_body = ""
        warnings = "\n======= WARNING EVENTS =======\n"
        for name in results.keys():
            individual_report = "{:25s} |  {:110s} |  {:20s}\n"
            individual_warning = "{name}: {events}\n\n"
            try:
                result = results[name].get()
                if result["failed"]:
                    failed = True
                code = result["code"][0]
                for c in result["code"][1:]:
                    code += " / {}".format(c)
                individual_report = individual_report.format(
                    name, code, result["duration"])
                if len(result["events"]) > 0:
                    warnings += individual_warning.format(
                        name=name, events=str(result["events"]))
            except Exception as e:
                failed = True
                logger.exception(str(e))
                individual_report = individual_report.format(
                    name, "EXCEPTION", "UNKNOWN")
                warnings += individual_warning.format(name=name, events=str(e))
            report_body += individual_report

        report_head = "\n\nPlatform {} {}. Report:\n".format(
            operation, "FAILED" if failed else "SUCCESSFULLY")
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "STATUS", "TIME (sec)")
        report_bar = "{}\n".format("-" * 174)
        return "{}{}{}{}{}{}".format(
            report_head, report_title, report_bar, report_body, warnings,
            "==============================\n"), failed

    def _get_eip_from_config_map(self):
        try:
            cmd = [
                "kubectl", "get", "configmap", "cluster-dns-name", "-o",
                "yaml", "--namespace", self.kube_axsys_namespace,
                "--kubeconfig", self._kube_config
            ]
            out = subprocess.check_output(cmd)
            return [yaml.load(out)["data"]["cluster-external-dns-name"]]
        except Exception:
            logger.error("Failed to get cluster dns name from config map.")
            return None

    @retry(wait_exponential_multiplier=1000, stop_max_attempt_number=5)
    def _get_svc_eip(self, svclabel, namespace):
        svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace,
                                                  svclabel)
        assert len(svc.items) == 1, \
            "Expected exactly one service matching {}".format(svclabel)
        rst = []
        for ig in svc.items[0].status.load_balancer.ingress:
            if ig.hostname:
                rst.append(ig.hostname)
            if ig.ip:
                rst.append(ig.ip)
        return rst

    def _set_ext_dns(self):
        axops_eip = (self._get_eip_from_config_map() or self._get_svc_eip(
            svclabel="app=axops", namespace=AXNameSpaces.AXSYS))

        if not axops_eip:
            logger.error(
                "Platform Start Failed: cannot find External IP for AXOPS")
            raise AXPlatformException("AXOPS elastic IP does not exist")

        self.cluster_dns_name = axops_eip[0]
        # Don't change format of this message. Portal parses this line to get cluster IP/DNS.
        logger.info(
            "\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n",
            COLOR_GREEN, self.cluster_dns_name, COLOR_NORM)
        self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name

    def get_cluster_external_dns(self):
        if not self.cluster_dns_name:
            self._set_ext_dns()
        return self.cluster_dns_name

    def _set_autoscaling(self):
        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = (asg_manager.get_variable_asg() or asg_manager.get_spot_asg()
               or asg_manager.get_on_demand_asg())
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if asg_name is not None:
            self._replacing["ASG_NAME"] = asg_name
        else:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

    # TODO (#157) Version should only be uploaded during install and upgrade time
    def _update_version(self):
        # The software info we get during install / upgrade does not contain the
        # AMI id, so we need to persist it as well
        self._software_info.ami_id = self._cluster_config.get_ami_id()

        AXVersion(AXCustomerId().get_customer_id(), self._cluster_name_id,
                  self._aws_profile).update(self._software_info.to_dict())
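
A hedged bring-up sketch based on the methods above (the cluster id value is hypothetical):

platform = AXPlatform(cluster_name_id="argo-cluster-1a2b3c4d", debug=False)
try:
    platform.start()           # create objects step by step per platform-start.cfg
finally:
    platform.stop_monitor()    # stop the AXKubeMonitor started in __init__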
Example #4
class AXClusterBuckets(object):
    """
    Bucket created in target account, same as cluster account.
    """
    def __init__(self, name_id, aws_profile, aws_region):
        self._name_id = name_id
        self._aws_profile = aws_profile
        self._aws_region = aws_region
        self.cluster_config = AXClusterConfig(cluster_name_id=self._name_id)

    def update(self):
        logger.info("Creating and updating all cluster buckets ...")
        self._update_cluster_bucket()
        self._update_data_bucket()
        logger.info("Creating and updating all cluster buckets ... DONE")

    def delete(self):
        logger.info("Deleting all cluster buckets ...")
        self._delete_cluster_bucket()
        self._delete_data_bucket()
        logger.info("Deleting all cluster buckets ... DONE")

    def _update_cluster_bucket(self):
        bucket_name = AXClusterConfigPath(name_id=self._name_id).bucket()
        cluster_bucket = Cloud().get_bucket(bucket_name,
                                            aws_profile=self._aws_profile,
                                            region=self._aws_region)

        if not cluster_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(
                cluster_bucket.get_bucket_name()))
        logger.info("Created %s bucket ... DONE",
                    cluster_bucket.get_bucket_name())

    def _update_data_bucket(self):
        data_bucket = Cloud().get_bucket(
            AXClusterDataPath(name_id=self._name_id).bucket(),
            aws_profile=self._aws_profile,
            region=self._aws_region)

        if not data_bucket.create():
            raise AXPlatformException("Failed to create S3 bucket {}".format(
                data_bucket.get_bucket_name()))

        if self.cluster_config.get_cluster_provider() != ClusterProvider.USER:
            # Update CORS config for data bucket too.
            logger.info("Checking CORS config for %s.",
                        data_bucket.get_bucket_name())
            data_bucket.put_cors(DATA_CORS_CONFIG)

        logger.info("Created %s bucket ... DONE",
                    data_bucket.get_bucket_name())

    def _delete_cluster_bucket(self):
        logger.info(
            "Deleting applatix-cluster bucket contents for cluster %s ...",
            self._name_id)
        cluster_bucket = Cloud().get_bucket(
            AXClusterConfigPath(name_id=self._name_id).bucket(),
            aws_profile=self._aws_profile,
            region=self._aws_region)

        idobj = AXClusterId(name=self._name_id)
        cluster_config_path = AXClusterConfigPath(name_id=self._name_id)
        cluster_name = idobj.get_cluster_name()
        prefix = cluster_name + "/"

        # TODO: Not idempotent here.
        # Consider the following case: if an exception is thrown while deleting S3
        # objects, install stage 1 information has already been deleted but not
        # everything was successfully deleted; the next time the user executes
        # "delete", this program will assume install stage 1 has been cleaned up.
        exempt = [
            idobj.get_cluster_id_s3_key(),
            cluster_config_path.cluster_install_stage0_key()
        ]
        logger.info(
            "Deleting objects for cluster %s from bucket %s. This may take some while.",
            cluster_name, cluster_bucket.get_bucket_name())
        cluster_bucket.delete_all(obj_prefix=prefix, exempt=exempt)
        logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                    cluster_name, cluster_bucket.get_bucket_name())
        logger.info("Deleting stage0 information ...")
        for item in exempt:
            cluster_bucket.delete_object(item)
        logger.info("Deleting stage0 information ... DONE")

    def _delete_data_bucket(self):
        logger.info(
            "Deleting applatix-data bucket contents for cluster %s ...",
            self._name_id)
        data_bucket = Cloud().get_bucket(
            AXClusterDataPath(name_id=self._name_id).bucket(),
            aws_profile=self._aws_profile,
            region=self._aws_region)
        cluster_name = AXClusterId(name=self._name_id).get_cluster_name()
        prefix = cluster_name + "/"
        logger.info(
            "Deleting objects for cluster %s from bucket %s. This may take some while.",
            cluster_name, data_bucket.get_bucket_name())
        data_bucket.delete_all(obj_prefix=prefix)
        logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                    cluster_name, data_bucket.get_bucket_name())
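
A hedged usage sketch (the profile and region values are hypothetical):

buckets = AXClusterBuckets("argo-cluster-1a2b3c4d", "default", "us-west-2")
buckets.update()   # create/refresh cluster and data buckets, with CORS where applicable
buckets.delete()   # delete bucket contents, removing stage0 keys last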
Example #5
class AXSYSKubeYamlUpdater(object):
    """
    This class loads a kubernetes yaml file, updates resource,
    and generate objects that kube_object.py can consume
    """
    def __init__(self, config_file_path):
        assert os.path.isfile(config_file_path), \
            "Config file {} is not a file".format(config_file_path)
        self._config_file = config_file_path
        self._cluster_name_id = AXClusterId().get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id)
        if not self._cluster_config.get_cluster_provider().is_user_cluster():
            self.cpu_mult, self.mem_mult, self.disk_mult, \
                self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers()
        else:
            self.cpu_mult = 1
            self.mem_mult = 1
            self.disk_mult = 1
            self.daemon_cpu_mult = 1
            self.daemon_mem_mult = 1
        self._swagger_components = []
        self._yaml_components = []
        self._updated_raw = ""

        # TODO: when we support configuring software info via a config file, we need to
        # figure out how that file gets passed through, since SoftwareInfo is not a singleton
        self._software_info = SoftwareInfo()

        self._load_objects()
        self._load_raw()

    @property
    def updated_raw(self):
        return self._updated_raw

    @property
    def components_in_dict(self):
        return self._yaml_components

    @property
    def components_in_swagger(self):
        return self._swagger_components

    def _load_objects(self):
        with open(self._config_file, "r") as f:
            data = f.read()
        for c in yaml.safe_load_all(data):
            swagger_obj = self._config_yaml(c)
            yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj)
            self._swagger_components.append(swagger_obj)
            self._yaml_components.append(yaml_obj)

    def _load_raw(self):
        self._updated_raw = yaml.dump_all(self._yaml_components)

    def _get_resource_multipliers(self):
        """
        Resources in yaml templates need to be multiplied with these numbers
        :return: cpu_multiplier, mem_multiplier, disk_multiplier
        """
        # Getting cluster size from cluster config, in order to configure resources.
        # There are 3 situations in which we use AXClusterConfig:
        #   - During install: since the class is a singleton, it has all the values we
        #     need; no need to download from s3
        #   - During upgrade: since we are exporting AWS_DEFAULT_PROFILE, we can download
        #     cluster config files from s3 to get the values
        #   - During job creation: the node axmon runs on has the proper roles to access s3

        try:
            ax_node_max = int(self._cluster_config.get_asxys_node_count())
            ax_node_type = self._cluster_config.get_axsys_node_type()
            usr_node_max = int(
                self._cluster_config.get_max_node_count()) - ax_node_max
            usr_node_type = self._cluster_config.get_axuser_node_type()
            assert all(
                [ax_node_max, ax_node_type, usr_node_max, usr_node_type])
        except Exception as e:
            logger.error(
                "Unable to read cluster config, skip resource config for %s. Error %s",
                self._config_file, e)
            return 1, 1, 1, 1, 1

        rc = AXSYSResourceConfig(
            ax_node_type=ax_node_type,
            ax_node_max=ax_node_max,
            usr_node_type=usr_node_type,
            usr_node_max=usr_node_max,
            cluster_type=self._cluster_config.get_ax_cluster_type())
        #logger.info("With %s %s axsys nodes, %s %s axuser nodes, component %s uses multipliers (%s, %s, %s, %s, %s)",
        #            ax_node_max, ax_node_type, usr_node_max, usr_node_type, self._config_file,
        #            rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier,
        #            rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier)
        return rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier

    def _config_yaml(self, kube_yaml_obj):
        """
        Load dict into swagger object, patch resource,
        sanitize, return a dict
        :param kube_yaml_obj:
        :return: swagger object with resource values finalized
        """
        kube_kind = kube_yaml_obj["kind"]
        (swagger_class_literal,
         swagger_instance) = KubeKindToV1KubeSwaggerObject[kube_kind]
        swagger_obj = ApiClient()._ApiClient__deserialize(
            kube_yaml_obj, swagger_class_literal)
        assert isinstance(swagger_obj, swagger_instance), \
            "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance)

        if isinstance(swagger_obj, V1beta1Deployment):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None

            node_selector = swagger_obj.spec.template.spec.node_selector
            if node_selector and node_selector.get('ax.tier', 'applatix') == 'master':
                # Skip updating containers on master.
                logger.info(
                    "Skip updating cpu, mem multipliers for pods on master: %s",
                    swagger_obj.metadata.name)
            else:
                for container in swagger_obj.spec.template.spec.containers:
                    self._update_container(container)
            return swagger_obj
        elif isinstance(swagger_obj, V1Pod):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.image_pull_secrets = None
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1DaemonSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            for container in swagger_obj.spec.template.spec.containers:
                # We special-case the applet DaemonSet to accommodate the fact that
                # we use a different node type for compute-intensive nodes
                if swagger_obj.metadata.name == "applet":
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=True)
                else:
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=False)
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1StatefulSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            return self._update_statefulset(swagger_obj)
        elif isinstance(swagger_obj, V1PersistentVolumeClaim):
            self._update_volume(swagger_obj)
            return swagger_obj
        else:
            # logger.info("Object %s does not need to configure resource", type(swagger_obj))
            # HACK, as the original hook will be messed up
            if isinstance(swagger_obj, V1Service):
                if swagger_obj.metadata.name == "axops":
                    swagger_obj.spec.load_balancer_source_ranges = []
                    if self._cluster_config and self._cluster_config.get_trusted_cidr(
                    ):
                        for cidr in self._cluster_config.get_trusted_cidr():
                            # Seems swagger client does not support unicode ... SIGH
                            swagger_obj.spec.load_balancer_source_ranges.append(
                                str(cidr))

                # HACK #2: if we don't do this, kubectl will complain about something
                # such as
                #
                # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z)
                #
                # p.target_port is declared as a string, but if it is actually a string,
                # kubectl looks for a port name rather than a number.
                # SIGH ...
                for p in swagger_obj.spec.ports or []:
                    try:
                        p.target_port = int(p.target_port)
                    except (ValueError, TypeError):
                        pass
            return swagger_obj

    def _update_deployment_or_daemonset(self, kube_obj):
        assert isinstance(kube_obj, V1beta1Deployment) or isinstance(
            kube_obj, V1beta1DaemonSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        return kube_obj

    def _update_statefulset(self, kube_obj):
        assert isinstance(kube_obj, V1beta1StatefulSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        if isinstance(kube_obj.spec.volume_claim_templates, list):
            for vol in kube_obj.spec.volume_claim_templates:
                self._update_volume(vol)
        return kube_obj

    def _update_container(self,
                          container,
                          is_daemon=False,
                          update_resource=True):
        assert isinstance(container, V1Container)

        if update_resource:
            cpulim = container.resources.limits.get("cpu")
            memlim = container.resources.limits.get("memory")
            cpureq = container.resources.requests.get("cpu")
            memreq = container.resources.requests.get("memory")

            def _massage_cpu(orig):
                return orig * self.daemon_cpu_mult if is_daemon else orig * self.cpu_mult

            def _massage_mem(orig):
                return orig * self.daemon_mem_mult if is_daemon else orig * self.mem_mult

            if cpulim:
                rvc = ResourceValueConverter(value=cpulim, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.limits["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if cpureq:
                rvc = ResourceValueConverter(value=cpureq, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.requests["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if memlim:
                rvc = ResourceValueConverter(value=memlim, target="memory")
                rvc.massage(_massage_mem)
                container.resources.limits["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))
            if memreq:
                rvc = ResourceValueConverter(value=memreq, target="memory")
                rvc.massage(_massage_mem)
                container.resources.requests["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))

        if container.liveness_probe and container.liveness_probe.http_get:
            try:
                container.liveness_probe.http_get.port = int(
                    container.liveness_probe.http_get.port)
            except (ValueError, TypeError):
                pass
        if container.readiness_probe and container.readiness_probe.http_get:
            try:
                container.readiness_probe.http_get.port = int(
                    container.readiness_probe.http_get.port)
            except (ValueError, TypeError):
                pass

        # Add resource multipliers to container envs in case they are needed
        if not container.env:
            container.env = []
        container.env += self._generate_default_envs(is_daemon,
                                                     update_resource)

    def _update_volume(self, vol):
        assert isinstance(vol, V1PersistentVolumeClaim)
        vol_size = vol.spec.resources.requests["storage"]

        def _massage_disk(orig):
            return orig * self.disk_mult

        if vol_size:
            rvc = ResourceValueConverter(value=vol_size, target="storage")
            rvc.massage(_massage_disk)
            # Since AWS does not support values such as 1.5G, round up to the next whole GiB
            vol.spec.resources.requests["storage"] = "{}Gi".format(
                int(ceil(rvc.convert("Gi"))))

        # Manually patch the access modes, as the swagger client mistakenly interprets this field as a map
        vol.spec.access_modes = ["ReadWriteOnce"]

    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Add essential environment variables to all system containers.
        :param is_daemon: whether the container belongs to a DaemonSet
        :param resource_updated: whether resource multipliers were applied
        :return: a list of V1EnvVar objects
        """
        default_envs = [
            # Kubernetes downward APIs
            {
                "name": "AX_NODE_NAME",
                "path": "spec.nodeName"
            },
            {
                "name": "AX_POD_NAME",
                "path": "metadata.name"
            },
            {
                "name": "AX_POD_NAMESPACE",
                "path": "metadata.namespace"
            },
            {
                "name": "AX_POD_IP",
                "path": "status.podIP"
            },

            # Values
            {
                "name": "DISK_MULT",
                "value": str(self.disk_mult)
            },
            {
                "name": "AX_TARGET_CLOUD",
                "value": Cloud().target_cloud()
            },
            {
                "name": "AX_CLUSTER_NAME_ID",
                "value": self._cluster_name_id
            },
            {
                "name": "AX_CUSTOMER_ID",
                "value": AXCustomerId().get_customer_id()
            },
        ]

        aws_region = os.environ.get("AX_AWS_REGION", "")
        if aws_region != "":
            default_envs.append({"name": "AX_AWS_REGION", "value": aws_region})

        if os.getenv("ARGO_S3_ACCESS_KEY_ID", "") != "":
            # Secrets
            default_envs.append({
                "name": "ARGO_S3_ACCESS_KEY_ID",
                "secret": "argo-access-key"
            })
            default_envs.append({
                "name": "ARGO_S3_ACCESS_KEY_SECRET",
                "secret": "argo-secret-key"
            })
            default_envs.append({
                "name": "ARGO_S3_ENDPOINT",
                "value": os.getenv("ARGO_S3_ENDPOINT", None)
            })

        # Special cases for daemons
        if is_daemon:
            if resource_updated:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": str(self.daemon_cpu_mult)
                    },
                    {
                        "name": "MEM_MULT",
                        "value": str(self.daemon_mem_mult)
                    },
                ]
            else:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": "1.0"
                    },
                    {
                        "name": "MEM_MULT",
                        "value": "1.0"
                    },
                ]
        else:
            default_envs += [
                {
                    "name": "CPU_MULT",
                    "value": str(self.cpu_mult)
                },
                {
                    "name": "MEM_MULT",
                    "value": str(self.mem_mult)
                },
            ]

        rst = []
        for d in default_envs:
            var = V1EnvVar()
            var.name = d["name"]

            if d.get("path", None):
                field = V1ObjectFieldSelector()
                field.field_path = d["path"]
                src = V1EnvVarSource()
                src.field_ref = field
                var.value_from = src
            elif d.get("secret", None):
                secret = V1SecretKeySelector()
                secret.key = d["secret"]
                secret.name = d["secret"]
                src = V1EnvVarSource()
                src.secret_key_ref = secret
                var.value_from = src
            else:
                var.value = d["value"]
            rst.append(var)
        return rst
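    # For reference (a sketch, not from the original source), a downward-API
    # entry generated above renders in the pod manifest roughly as:
    #   - name: AX_NODE_NAME
    #     valueFrom:
    #       fieldRef:
    #         fieldPath: spec.nodeName
    # and a secret-backed entry as:
    #   - name: ARGO_S3_ACCESS_KEY_ID
    #     valueFrom:
    #       secretKeyRef:
    #         name: argo-access-key
    #         key: argo-access-key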
Example #6
0
class Deployment(object):
    """
    This class creates and manages a single deployment object
    A deployment consists of the following specifications in kubernetes
    1. A kubernetes deployment spec
    2. Zero or more kubernetes service specs
    3. Zero or more ingress rules

    All functions in the object need to be idempotent.
    """
    def __init__(self, name, application):
        """
        Each deployment has a name and needs to be part of an application
        Application maps to a kubernetes namespace and the deployment will
        be created in this namespace.

        Args:
            name: deployment name
            application: the application that this deployment runs under
        """
        self.name = name
        self.application = application
        self.client = KubernetesApiClient(use_proxy=True)
        self._nameid = AXClusterId().get_cluster_name_id()
        self._software_info = SoftwareInfo()

        self._app_obj = Application(application)

        self._resources = AXResources()
        self.spec = None

        self._cluster_config = AXClusterConfig()

    def create(self, spec):
        """
        Create a deployment from the template specified

        Idempotency: This function is idempotent. A create of identical spec will
        have no impact if the deployment already exists. If the spec is different
        then the existing deployment will be updated.
        """
        @retry_unless(status_code=[404, 422])
        def create_in_provider(k8s_spec):
            try:
                logger.info(
                    "Creating deployment %s in Kubernetes namespace %s",
                    self.name, self.application)
                self.client.apisappsv1beta1_api.create_namespaced_deployment(
                    k8s_spec, self.application)
                logger.info(
                    "Done creating deployment %s in Kubernetes namespace %s",
                    self.name, self.application)
            except swagger_client.rest.ApiException as e:
                if e.status == 409:
                    self.client.apisappsv1beta1_api.replace_namespaced_deployment(
                        k8s_spec, self.application, self.name)
                else:
                    raise

        with DeploymentOperation(self):

            self.spec = spec

            # Do some template checks
            self._template_checks()

            # First create supplemental resources such as routes, ingress rules etc
            self._create_deployment_resources()

            # Now create the deployment spec
            d_spec = self._create_deployment_spec()

            # Store the resources in the deployment spec
            self._resources.finalize(d_spec)

            # Create the deployment object in kubernetes
            create_in_provider(d_spec)

    def delete(self, timeout=None):
        """
        Delete the deployment.

        Idempotency: This function is idempotent. If the deployment does not exist,
        delete is a silent no-op and raises no exceptions.
        Args:
            timeout: In seconds or None for infinite
        """
        options = swagger_client.V1DeleteOptions()
        options.grace_period_seconds = 1
        options.orphan_dependents = False
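        # Note (standard Kubernetes semantics): orphan_dependents=False asks the
        # API server to garbage-collect dependents (replica sets, pods) rather
        # than orphaning them when the deployment is deleted.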

        def check_result(result):
            # True means retry, False means done
            return not result

        @retry(retry_on_result=check_result,
               wait_fixed=2000,
               stop_max_delay=timeout)
        def wait_for_scale_to_zero():
            logger.debug("Wait for scale of deployment to 0 for {} {}".format(
                self.application, self.name))

            @retry_unless(swallow_code=[404])
            def get_scale_from_provider():
                return self.client.apisappsv1beta1_api.read_namespaced_scale_scale(
                    self.application, self.name)

            scale = get_scale_from_provider()
            if scale is None:
                return True
            if scale.status.replicas == 0:
                return True

            return False

        @retry_unless(swallow_code=[404, 409])
        def delete_in_provider():
            logger.debug("Deleting deployment for {} {}".format(
                self.application, self.name))
            self.client.apisappsv1beta1_api.delete_namespaced_deployment(
                options, self.application, self.name)

        def delete_rs_in_provider():
            logger.debug("Deleting replica set for {} {}".format(
                self.application, self.name))
            self.client.extensionsv1beta1.deletecollection_namespaced_replica_set(
                self.application,
                label_selector="deployment={}".format(self.name))

        # Now delete the deployment object and its replica sets
        with DeploymentOperation(self):
            dep_obj = self._deployment_status()
            self._scale_to(0)
            wait_for_scale_to_zero()

            if dep_obj:
                resources = AXResources(existing=dep_obj)
                resources.delete_all()

            delete_in_provider()
            delete_rs_in_provider()

    def status(self):
        """
        Get the status of the deployment.
        Returns: the entire V1Deployment as a dict.
        If the deployment is not found, raises AXNotFoundException (404).
        """
        # STEP 1: Get status of deployment
        stat = self._deployment_status()
        if stat is None:
            raise AXNotFoundException(
                "Deployment {} not found in application {}".format(
                    self.name, self.application))

        dep_field_map = {
            "name": "metadata.name",
            "generation": "metadata.annotations.ax_generation",
            "desired_replicas": "status.replicas",
            "available_replicas": "status.available_replicas",
            "unavailable_replicas": "status.unavailable_replicas"
        }
        ret = KubeObject.swagger_obj_extract(stat,
                                             dep_field_map,
                                             serializable=True)

        # STEP 2: Get the pods for the deployment and events associated
        podlist = self._deployment_pods().items
        dep_events = self._app_obj.events(name=self.name)
        event_field_map = {
            "message": "message",
            "reason": "reason",
            "source": "source.component",
            "host": "source.host",
            "firstTS": "first_timestamp",
            "lastTS": "last_timestamp",
            "count": "count",
            "container": "involved_object.field_path",
            "type": "type"
        }
        ret["events"] = []
        for event in dep_events:
            ret["events"].append(
                KubeObject.swagger_obj_extract(event,
                                               event_field_map,
                                               serializable=True))

        ret["pods"] = []
        for pod in podlist or []:
            # fill pod status and containers
            pstatus = Pod.massage_pod_status(pod)

            # fill events for pod
            pstatus["events"] = []

            events = self._app_obj.events(name=pod.metadata.name)
            for event in events:
                pstatus["events"].append(
                    KubeObject.swagger_obj_extract(event,
                                                   event_field_map,
                                                   serializable=True))

            # fill pod failure information for pod based on events
            pstatus["failure"] = Deployment._pod_failed_info(pstatus)

            ret["pods"].append(pstatus)

        # STEP 3: From the deployment spec get the resources created by deployment
        # TODO: Add this when services are created by deployment

        return ret

    def get_labels(self):
        """
        Get a dict of labels used for this deployment
        """
        state = self._deployment_status()
        if state is None:
            raise AXNotFoundException(
                "Did not find deployment {} in application {}".format(
                    self.name, self.application))

        return KubeObject.swagger_obj_extract(
            state, {"labels": "spec.selector.match_labels"})['labels']

    @staticmethod
    def _pod_failed_info(pod_status):
        if pod_status["phase"] != "Pending":
            return None

        for ev in pod_status["events"] or []:
            if ev["reason"] == "Failed" and ev["source"] == "kubelet" and ev["type"] == "Warning" and \
                            "Failed to pull image" in ev["message"] and ev["count"] > 5:
                return {"reason": "ImagePullFailure", "message": ev["message"]}
        return None
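    # Illustrative example (hypothetical event): a Pending pod whose events
    # include {"reason": "Failed", "source": "kubelet", "type": "Warning",
    # "message": "Failed to pull image ...", "count": 6} is reported as
    # {"reason": "ImagePullFailure", "message": "Failed to pull image ..."}.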

    def scale(self, replicas):
        with DeploymentOperation(self):
            # Deployments with volumes can't be scaled to > 1.
            if replicas > 1:
                dep_obj = self._deployment_status()
                if dep_obj:
                    resources = AXResources(existing=dep_obj)
                    for res_type in resources.get_all_types():
                        if res_type.startswith("ax.platform.volumes"):
                            raise AXApiForbiddenReq(
                                "Deployments with volumes can't be scaled to > 1 ({})"
                                .format(replicas))

            self._scale_to(replicas)

    @retry_unless(swallow_code=[404])
    def _deployment_status(self):
        return self.client.apisappsv1beta1_api.read_namespaced_deployment(
            self.application, self.name)

    @retry_unless(swallow_code=[404])
    def _deployment_pods(self):
        return self.client.api.list_namespaced_pod(
            self.application, label_selector="deployment={}".format(self.name))

    def _create_deployment_spec(self):

        pod_spec = PodSpec(self.name, namespace=self.application)
        main_container = self.spec.template.get_main_container()

        main_container_spec = self._create_main_container_spec(main_container)
        pod_spec.add_main_container(main_container_spec)

        container_vols = self._get_main_container_vols()
        main_container_spec.add_volumes(container_vols)

        hw_res = main_container.get_resources()
        main_container_spec.add_resource_constraints("cpu_cores",
                                                     hw_res.cpu_cores,
                                                     limit=None)
        main_container_spec.add_resource_constraints("mem_mib",
                                                     hw_res.mem_mib,
                                                     limit=None)

        artifacts_container = pod_spec.enable_artifacts(
            self._software_info.image_namespace,
            self._software_info.image_version, None, main_container.to_dict())
        secret_resources = artifacts_container.add_configs_as_vols(
            main_container.get_all_configs(), self.name, self.application)
        self._resources.insert_all(secret_resources)

        # Set up special cases based on annotations
        # Check if we need to circumvent the executor script. This is needed for containers that run
        # special init processes, such as systemd, which expect to be PID 1
        if main_container.executor_spec:
            main_container_spec.command = None
            if main_container.docker_spec is not None:
                raise ValueError(
                    "We do not support ax_ea_docker_enable with ax_ea_executor"
                )

        # Does this container need to be privileged
        main_container_spec.privileged = main_container.privileged

        # Check if docker daemon sidecar needs to be added
        if main_container.docker_spec:
            # graph storage size is specified in MiB
            dind_container_spec = pod_spec.enable_docker(
                main_container.docker_spec.graph_storage_size_mib)
            dind_container_spec.add_volumes(pod_spec.get_artifact_vols())
            dind_container_spec.add_resource_constraints(
                "cpu_cores", main_container.docker_spec.cpu_cores, limit=None)
            dind_container_spec.add_resource_constraints(
                "mem_mib", main_container.docker_spec.mem_mib, limit=None)
            dind_container_spec.add_volumes(container_vols)

        # Does the main container need only a docker graph storage volume?
        if main_container.graph_storage:
            dgs_vol = ContainerVolume("graph-storage-vol-only",
                                      main_container.graph_storage.mount_path)
            dgs_vol.set_type(
                "DOCKERGRAPHSTORAGE",
                main_container.graph_storage.graph_storage_size_mib)
            main_container_spec.add_volume(dgs_vol)

        # set the pod hostname to value provided in main container spec
        pod_spec.hostname = main_container.hostname

        # TODO: This needs fixup. job name is used in init container to ask permission to start
        # TODO: Don't know if this is needed in deployment or not?
        artifacts_container.add_env("AX_JOB_NAME", value=self.application)
        artifacts_container.add_env("AX_DEPLOYMENT_NEW", value="True")

        if len(container_vols) > 0:
            tmp_container_vols = copy.deepcopy(container_vols)
            volume_paths = []
            for v in tmp_container_vols:
                v.set_mount_path("/ax/fix" + v.volmount.mount_path)
                volume_paths.append(v.volmount.mount_path)
            artifacts_container.add_volumes(tmp_container_vols)
            logger.info("Volumes to chmod: %s", volume_paths)
            artifacts_container.add_env("AX_VOL_MOUNT_PATHS",
                                        value=str(volume_paths))

        # add annotation for service env which will show up in artifacts container
        pod_spec.add_annotation("AX_SERVICE_ENV",
                                self._generate_service_env(self.spec.template))
        pod_spec.add_annotation("AX_IDENTIFIERS", self._get_identifiers())
        if self.spec.costid:
            pod_spec.add_annotation("ax_costid", json.dumps(self.spec.costid))

        pod_spec.add_label("deployment", self.name)
        pod_spec.add_label("application", self.application)
        pod_spec.add_label("tier", "user")
        pod_spec.add_label("deployment_id", self.spec.id)

        # now that pod is ready get its spec and wrap it in a deployment
        k8s_spec = self._generate_deployment_spec_for_pod(pod_spec.get_spec())

        logger.info("Generated Kubernetes spec for deployment %s", self.name)
        return k8s_spec

    def _create_main_container_spec(self, container_template):
        """
        :type container_template: argo.template.v1.container.ContainerTemplate
        :rtype Container
        """
        logger.debug("Container template is {}".format(container_template))

        name = string_to_dns_label(container_template.name)
        container_spec = Container(
            name,
            container_template.image,
            pull_policy=container_template.image_pull_policy)
        container_spec.parse_probe_spec(container_template)

        # Necessary envs for handshake
        container_spec.add_env("AX_HANDSHAKE_VERSION",
                               value=CUR_RECORD_VERSION)

        # Envs introduced to user
        container_spec.add_env("AX_POD_NAME", value_from="metadata.name")
        container_spec.add_env("AX_POD_IP", value_from="status.podIP")
        container_spec.add_env("AX_POD_NAMESPACE",
                               value_from="metadata.namespace")
        container_spec.add_env("AX_NODE_NAME", value_from="spec.nodeName")
        container_spec.add_env("AX_CLUSTER_META_URL_V1",
                               value=CLUSTER_META_URL_V1)

        # envs from user spec
        for env in container_template.env:
            (cfg_ns, cfg_name, cfg_key) = env.get_config()
            if cfg_ns is not None:
                secret = SecretResource(cfg_ns, cfg_name, self.name,
                                        self.application)
                secret.create()
                self._resources.insert(secret)
                container_spec.add_env(
                    env.name,
                    value_from_secret=(secret.get_resource_name(), cfg_key))
            else:
                container_spec.add_env(env.name, value=env.value)

        # Unix socket for applet
        applet_sock = ContainerVolume("applet", "/tmp/applatix.io/")
        applet_sock.set_type("HOSTPATH", "/var/run/")
        container_spec.add_volume(applet_sock)

        return container_spec

    @staticmethod
    def _get_valid_name_from_axrn(axrn):
        # AXRNs contain non-alphanumeric characters such as : / @, which K8S does not
        # allow in a PVC name. Replace runs of non-word characters with -, then map _ to - as well.
        name_regex = re.compile(r"\W+")
        return name_regex.sub("-", axrn).replace("_", "-")
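    # For example (illustrative): _get_valid_name_from_axrn("vol:prod/data_v1")
    # -> "vol-prod-data-v1": runs of non-word characters become "-", and any
    # remaining underscores are mapped to "-" as well.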

    def _get_main_container_vols(self):
        container_template = self.spec.template.get_main_container()
        ret = []

        for vol_name, vol in iteritems(container_template.inputs.volumes):
            # sanitize the volume name for kubernetes
            vol_name = string_to_dns_label(vol_name)
            cvol = ContainerVolume(vol_name, vol.mount_path)
            assert "resource_id" in vol.details, "Volume resource-id absent in volume details"
            assert "filesystem" in vol.details, "Volume filesystem absent in volume details"
            cvol.set_type("AWS_EBS", vol_name, vol.details["resource_id"],
                          vol.details["filesystem"])
            logger.debug("Volume {} {} mounted at {}".format(
                vol_name, vol.details, vol.mount_path))
            ret.append(cvol)

        return ret

    def _generate_service_env(self, template):
        return base64.b64encode(json.dumps(template.to_dict()))
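    # Note: under Python 3, json.dumps returns str while base64.b64encode expects
    # bytes, so the call above would need .encode(); the use of iteritems elsewhere
    # suggests this code targets Python 2, where str is bytes.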

    def _get_identifiers(self):
        return {
            "application_id": self.spec.app_generation,
            "deployment_id": self.spec.id,
            "static": {
                "application_id": self.spec.app_id,
                "deployment_id": self.spec.deployment_id
            }
        }

    def _generate_deployment_spec_for_pod(self, pod_spec):

        metadata = swagger_client.V1ObjectMeta()
        metadata.name = self.name

        dspec = swagger_client.V1beta1DeploymentSpec()
        dspec.strategy = self._get_strategy()
        if self.spec.template.min_ready_seconds:
            dspec.min_ready_seconds = self.spec.template.min_ready_seconds
        dspec.selector = swagger_client.V1LabelSelector()
        dspec.selector.match_labels = {"deployment": self.name}

        dspec.replicas = self.spec.template.scale.min
        dspec.template = pod_spec

        deployment_obj = swagger_client.V1beta1Deployment()
        deployment_obj.metadata = metadata
        deployment_obj.spec = dspec
        return deployment_obj

    def _create_deployment_resources(self):

        for route in self.spec.template.internal_routes:
            # ignore empty port spec
            if len(route.ports) == 0:
                logger.debug(
                    "Skipping internal route {} as port spec is empty".format(
                        route.name))
                continue
            ir = InternalRoute(route.name, self.application)
            ir.create(route.to_dict()["ports"],
                      selector={"deployment": self.name},
                      owner=self.name)
            self._resources.insert(ir)
            logger.debug("Created route {}".format(ir))

        for route in self.spec.template.external_routes:

            dns_name = route.dns_name()
            if dns_name.endswith("."):
                dns_name = dns_name[:-1]

            r = ExternalRoute(dns_name, self.application,
                              {"deployment": self.name}, route.target_port,
                              route.ip_white_list, route.visibility)

            elb_addr = None
            elb_name = None

            if not self._cluster_config.get_cluster_provider().is_user_cluster(
            ):
                try:
                    elb_addr = visibility_to_elb_addr(route.visibility)
                    elb_name = visibility_to_elb_name(route.visibility)
                except AXNotFoundException:
                    if route.visibility == ExternalRouteVisibility.VISIBILITY_WORLD:
                        raise AXNotFoundException(
                            "Could not find the public ELB. Please report this error to Applatix Support at [email protected]"
                        )
                    else:
                        assert route.visibility == ExternalRouteVisibility.VISIBILITY_ORGANIZATION, "Only world and organization are currently supported as visibility attributes"
                        raise AXNotFoundException(
                            "Please create a private ELB using the template named 'ax_private_elb_creator_workflow' before using 'visibility=organization'"
                        )

            name = r.create(elb_addr, elb_name=elb_name)
            self._resources.insert(r)
            logger.debug("Created external route {} for {}/{}/{}".format(
                name, self.application, self.name, dns_name))

        main_container = self.spec.template.get_main_container()
        for key_name, vol in iteritems(main_container.inputs.volumes):
            assert "resource_id" in vol.details, "Volume resource_id absent in volume details"
            name = vol.details.get("axrn", None)
            resource_id = vol.details.get("resource_id", None)
            assert name is not None and resource_id is not None, "axrn and resource_id are required details for volume {}".format(
                key_name)
            nv_res = AXNamedVolumeResource(name, resource_id)
            nv_res.create()
            self._resources.insert(nv_res)
            logger.debug(
                "Using named volume resource {} in application {}".format(
                    name, self.application))

    @retry_unless(status_code=[422], swallow_code=[400, 404])
    def _scale_to(self, replicas):
        logger.debug("Scaling deployment to {} for {} {}".format(
            replicas, self.application, self.name))
        scale = swagger_client.V1beta1Scale()
        scale.spec = swagger_client.V1beta1ScaleSpec()
        scale.spec.replicas = replicas
        scale.metadata = swagger_client.V1ObjectMeta()
        scale.metadata.name = self.name
        scale.metadata.namespace = self.application
        self.client.apisappsv1beta1_api.replace_namespaced_scale_scale(
            scale, self.application, self.name)

    def _template_checks(self):
        if self.spec.template.scale and self.spec.template.scale.min > 1 and len(
                self.spec.template.volumes) >= 1:
            raise ValueError(
                "Deployments with volumes can't have scale > 1 ({})".format(
                    self.spec.template.scale.min))

    def _get_strategy(self):
        s = swagger_client.V1beta1DeploymentStrategy()
        s.type = "RollingUpdate" if self.spec.template.strategy.type == "rolling_update" else "Recreate"
        if s.type == "RollingUpdate":
            rolling_update = swagger_client.V1beta1RollingUpdateDeployment()
            rolling_update.max_unavailable = self.spec.template.strategy.rolling_update.max_unavailable
            rolling_update.max_surge = self.spec.template.strategy.rolling_update.max_surge
            s.rolling_update = rolling_update
        return s
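# Usage sketch (illustrative, not part of the original source): the intended
# lifecycle of the Deployment class, assuming `parsed_spec` was built from a
# deployment template:
#
#   dep = Deployment("web", "my-app")
#   dep.create(parsed_spec)   # idempotent: a 409 conflict triggers a replace
#   dep.scale(replicas=2)     # rejected if the deployment uses volumes
#   dep.status()              # dict with replicas, events and per-pod status
#   dep.delete(timeout=300)   # scale to 0, then delete deployment + replica sets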