def update_cluster_config(self):
        """
        Upgrade the cluster config in S3 such that it has all required fields.
        """
        logger.info("Updating cluster config!")
        cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id,
                                         aws_profile=self._profile)
        cluster_info = AXClusterInfo(cluster_name_id=self._cluster_name_id,
                                     aws_profile=self._profile)

        # Separate axsys / axuser config if needed
        update_node_config_key_needed = False
        try:
            # The new cluster config exposes "max_node_count"; this raises
            # KeyError if the config in S3 is still in the old format
            cluster_config.get_max_node_count()
        except KeyError:
            update_node_config_key_needed = True

        if update_node_config_key_needed:
            logger.info("Updating node config keys ...")
            # Parse the old raw config directly
            configure = cluster_config._conf["cloud"]["configure"]
            minion_type = configure["minion_type"]
            max_count = configure["max_count"]
            min_count = configure["min_count"]
            axsys_count = configure["axsys_nodes"]

            # Remove all old keys
            for old_key in ["minion_type", "max_count", "min_count", "axsys_nodes"]:
                configure.pop(old_key, None)

            # Set the new keys
            configure["axsys_node_count"] = axsys_count
            configure["max_node_count"] = max_count
            configure["min_node_count"] = min_count

            # All clusters that need this upgrade use the same node type for axsys and axuser
            configure["axuser_node_type"] = minion_type
            configure["axsys_node_type"] = minion_type
        else:
            logger.info("Node config keys are already up-to-date")

        # If cluster type is not set, default it to the standard type
        if cluster_config.get_ax_cluster_type() is None:
            cluster_config._conf["cloud"]["configure"][
                "cluster_type"] = AXClusterType.STANDARD

        # Check and update Cluster user. Defaults to "customer"
        if cluster_config.get_ax_cluster_user() is None:
            cluster_config.set_ax_cluster_user('customer')

        # Check and update cluster size. Defaults to "small"
        if cluster_config.get_ax_cluster_size() is None:
            max_count = cluster_config.get_max_node_count()
            cluster_size = {
                5: "small",
                10: "medium",
                21: "large",
                30: "xlarge",
            }.get(max_count, "small")
            cluster_config.set_ax_cluster_size(cluster_size)

        # Check and update AX volume size. Note that this has to come *AFTER* the cluster_size is set.
        if cluster_config.get_ax_vol_size() is None:
            cluster_size = cluster_config.get_ax_cluster_size()
            vol_size = {
                "small": 100,
                "medium": 100,
                "large": 200,
                "xlarge": 400,
            }.get(cluster_size, 100)
            cluster_config.set_ax_vol_size(vol_size)

        # Ensure that we have 3 tiers now
        cluster_config.set_node_tiers("master/applatix/user")

        # Set the new AMI id
        ami_name = os.getenv("AX_AWS_IMAGE_NAME")
        ami_id = AMI(
            aws_region=self._region,
            aws_profile=self._profile).get_ami_id_from_name(ami_name=ami_name)
        logger.info("Updating cluster config with ami %s", ami_id)
        cluster_config.set_ami_id(ami_id)

        cluster_config.save_config()
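
# Example (sketch, hypothetical driver): update_cluster_config mutates the
# config loaded from S3 in place and persists it, so an upgrade flow would
# call something like:
#
#   upgrader = ClusterUpgrader(cluster_name_id="mycluster-1a2b3c",
#                              aws_profile="dev")   # hypothetical owner class
#   upgrader.update_cluster_config()                # rewrites config in S3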
class AXSYSKubeYamlUpdater(object):
    """
    This class loads a kubernetes yaml file, updates resource,
    and generate objects that kube_object.py can consume
    """
    def __init__(self, config_file_path):
        assert os.path.isfile(config_file_path), \
            "Config file {} is not a file".format(config_file_path)
        self._config_file = config_file_path
        self._cluster_name_id = AXClusterId().get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id)
        self.cpu_mult, self.mem_mult, self.disk_mult, \
            self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers()
        self._swagger_components = []
        self._yaml_components = []
        self._updated_raw = ""

        # TODO: when we support config software info using a config file, need to figure out how that
        # file gets passed through, since SoftwareInfo is not a singleton
        self._software_info = SoftwareInfo()

        self._load_objects()
        self._load_raw()

    @property
    def updated_raw(self):
        return self._updated_raw

    @property
    def components_in_dict(self):
        return self._yaml_components

    @property
    def components_in_swagger(self):
        return self._swagger_components

    def _load_objects(self):
        with open(self._config_file, "r") as f:
            data = f.read()
        # These are plain Kubernetes manifests without custom YAML tags, so the
        # safe loader suffices (yaml.load_all without an explicit Loader is deprecated)
        for c in yaml.safe_load_all(data):
            swagger_obj = self._config_yaml(c)
            yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj)
            self._swagger_components.append(swagger_obj)
            self._yaml_components.append(yaml_obj)

    def _load_raw(self):
        self._updated_raw = yaml.dump_all(self._yaml_components)

    def _get_resource_multipliers(self):
        """
        Resources in yaml templates need to be multiplied with these numbers
        :return: cpu_multiplier, mem_multiplier, disk_multiplier
        """
        # Get the cluster size from cluster config in order to scale resources.
        # There are 3 situations in which we use AXClusterConfig:
        #   - During install: since the class is a singleton, it already has all
        #     the values we need, so there is no need to download from s3
        #   - During upgrade: since we are exporting AWS_DEFAULT_PROFILE, we can
        #     download cluster config files from s3 to get the values
        #   - During job creation: the node axmon runs on has the proper role to access s3

        try:
            ax_node_max = int(self._cluster_config.get_asxys_node_count())
            ax_node_type = self._cluster_config.get_axsys_node_type()
            usr_node_max = int(
                self._cluster_config.get_max_node_count()) - ax_node_max
            usr_node_type = self._cluster_config.get_axuser_node_type()
            assert all(
                [ax_node_max, ax_node_type, usr_node_max, usr_node_type])
        except Exception as e:
            logger.error(
                "Unable to read cluster config, skip resource config for %s. Error %s",
                self._config_file, e)
            return 1, 1, 1, 1, 1

        rc = AXSYSResourceConfig(
            ax_node_type=ax_node_type,
            ax_node_max=ax_node_max,
            usr_node_type=usr_node_type,
            usr_node_max=usr_node_max,
            cluster_type=self._cluster_config.get_ax_cluster_type())
        #logger.info("With %s %s axsys nodes, %s %s axuser nodes, component %s uses multipliers (%s, %s, %s, %s, %s)",
        #            ax_node_max, ax_node_type, usr_node_max, usr_node_type, self._config_file,
        #            rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier,
        #            rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier)
        return rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier
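
    # Example (illustrative): how a multiplier is applied to a template value,
    # mirroring the ResourceValueConverter usage in _update_container below:
    #
    #   rvc = ResourceValueConverter(value="100m", target="cpu")
    #   rvc.massage(lambda orig: orig * 2)   # e.g. a cpu multiplier of 2
    #   rvc.convert("m")                     # -> 200 (millicores)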

    def _config_yaml(self, kube_yaml_obj):
        """
        Load dict into swagger object, patch resource,
        sanitize, return a dict
        :param kube_yaml_obj:
        :return: swagger object with resource values finalized
        """
        kube_kind = kube_yaml_obj["kind"]
        swagger_class_literal, swagger_instance = KubeKindToV1KubeSwaggerObject[kube_kind]
        # Build the swagger object from the raw dict via ApiClient's
        # name-mangled private deserializer
        swagger_obj = ApiClient()._ApiClient__deserialize(
            kube_yaml_obj, swagger_class_literal)
        assert isinstance(swagger_obj, swagger_instance), \
            "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance)

        if isinstance(swagger_obj, V1beta1Deployment):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None

            node_selector = swagger_obj.spec.template.spec.node_selector or {}
            if node_selector.get('ax.tier', 'applatix') == 'master':
                # Skip updating containers on master.
                logger.info(
                    "Skip updating cpu, mem multipliers for pods on master: %s",
                    swagger_obj.metadata.name)
            else:
                for container in swagger_obj.spec.template.spec.containers:
                    self._update_container(container)
            return swagger_obj
        elif isinstance(swagger_obj, V1Pod):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.image_pull_secrets = None
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1DaemonSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            for container in swagger_obj.spec.template.spec.containers:
                # Special-case the applet DaemonSet to accommodate the fact that
                # compute-intense nodes use a different node type
                if swagger_obj.metadata.name == "applet":
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=True)
                else:
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=False)
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1StatefulSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            return self._update_statefulset(swagger_obj)
        elif isinstance(swagger_obj, V1PersistentVolumeClaim):
            self._update_volume(swagger_obj)
            return swagger_obj
        else:
            # logger.info("Object %s does not need to configure resource", type(swagger_obj))
            # HACK, as the original hook will be messed up
            if isinstance(swagger_obj, V1Service):
                if swagger_obj.metadata.name == "axops":
                    swagger_obj.spec.load_balancer_source_ranges = []
                    for cidr in self._cluster_config.get_trusted_cidr():
                        # Seems swagger client does not support unicode ... SIGH
                        swagger_obj.spec.load_balancer_source_ranges.append(
                            str(cidr))

                # HACK #2: if we don't do this, kubectl will complain about something such as
                #
                # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z)
                #
                # p.target_port is typed as a string, but when it really is a string,
                # kubectl treats it as a port *name* rather than a number
                # SIGH ...
                for p in swagger_obj.spec.ports or []:
                    try:
                        p.target_port = int(p.target_port)
                    except (ValueError, TypeError):
                        pass
            return swagger_obj

    def _update_deployment_or_daemonset(self, kube_obj):
        assert isinstance(kube_obj, (V1beta1Deployment, V1beta1DaemonSet))
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        return kube_obj

    def _update_statefulset(self, kube_obj):
        assert isinstance(kube_obj, V1beta1StatefulSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        if isinstance(kube_obj.spec.volume_claim_templates, list):
            for vol in kube_obj.spec.volume_claim_templates:
                self._update_volume(vol)
        return kube_obj

    def _update_container(self,
                          container,
                          is_daemon=False,
                          update_resource=True):
        assert isinstance(container, V1Container)

        if update_resource:
            # Guard against templates that specify only limits or only requests
            limits = container.resources.limits or {}
            requests = container.resources.requests or {}
            cpulim = limits.get("cpu")
            memlim = limits.get("memory")
            cpureq = requests.get("cpu")
            memreq = requests.get("memory")

            def _massage_cpu(orig):
                return orig * self.daemon_cpu_mult if is_daemon else orig * self.cpu_mult

            def _massage_mem(orig):
                return orig * self.daemon_mem_mult if is_daemon else orig * self.mem_mult

            if cpulim:
                rvc = ResourceValueConverter(value=cpulim, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.limits["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if cpureq:
                rvc = ResourceValueConverter(value=cpureq, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.requests["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if memlim:
                rvc = ResourceValueConverter(value=memlim, target="memory")
                rvc.massage(_massage_mem)
                container.resources.limits["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))
            if memreq:
                rvc = ResourceValueConverter(value=memreq, target="memory")
                rvc.massage(_massage_mem)
                container.resources.requests["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))

        if container.liveness_probe and container.liveness_probe.http_get:
            try:
                container.liveness_probe.http_get.port = int(
                    container.liveness_probe.http_get.port)
            except (ValueError, TypeError):
                pass
        if container.readiness_probe and container.readiness_probe.http_get:
            try:
                container.readiness_probe.http_get.port = int(
                    container.readiness_probe.http_get.port)
            except (ValueError, TypeError):
                pass

        # Add resource multiplier to containers in case we need them
        if not container.env:
            container.env = []
        container.env += self._generate_default_envs(is_daemon,
                                                     update_resource)

    def _update_volume(self, vol):
        assert isinstance(vol, V1PersistentVolumeClaim)
        vol_size = (vol.spec.resources.requests or {}).get("storage")

        def _massage_disk(orig):
            return orig * self.disk_mult

        if vol_size:
            rvc = ResourceValueConverter(value=vol_size, target="storage")
            rvc.massage(_massage_disk)
            # Since AWS does not support value such as 1.5G, lets round up to its ceil
            vol.spec.resources.requests["storage"] = "{}Gi".format(
                int(ceil(rvc.convert("Gi"))))

        # Manually patch access mode as swagger client mistakenly interprets this as map
        vol.spec.access_modes = ["ReadWriteOnce"]
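
    # Example (illustrative): with disk_mult == 1.5, a "1Gi" storage request in
    # _update_volume is massaged to 1.5Gi and rounded up by the ceil to "2Gi".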

    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Add essential variables to all system containers
        :param is_daemon:
        :return:
        """
        default_envs = [
            # Kubernetes downward APIs
            {
                "name": "AX_NODE_NAME",
                "path": "spec.nodeName"
            },
            {
                "name": "AX_POD_NAME",
                "path": "metadata.name"
            },
            {
                "name": "AX_POD_NAMESPACE",
                "path": "metadata.namespace"
            },
            {
                "name": "AX_POD_IP",
                "path": "status.podIP"
            },

            # Values
            {
                "name": "DISK_MULT",
                "value": str(self.disk_mult)
            },
            {
                "name": "AX_TARGET_CLOUD",
                "value": Cloud().target_cloud()
            },
            {
                "name": "AX_CLUSTER_NAME_ID",
                "value": self._cluster_name_id
            },
            {
                "name": "AX_CUSTOMER_ID",
                "value": AXCustomerId().get_customer_id()
            },
        ]

        # Special cases for daemons
        if is_daemon:
            if resource_updated:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": str(self.daemon_cpu_mult)
                    },
                    {
                        "name": "MEM_MULT",
                        "value": str(self.daemon_mem_mult)
                    },
                ]
            else:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": "1.0"
                    },
                    {
                        "name": "MEM_MULT",
                        "value": "1.0"
                    },
                ]
        else:
            default_envs += [
                {
                    "name": "CPU_MULT",
                    "value": str(self.cpu_mult)
                },
                {
                    "name": "MEM_MULT",
                    "value": str(self.mem_mult)
                },
            ]

        rst = []
        for d in default_envs:
            var = V1EnvVar()
            var.name = d["name"]

            if d.get("path", None):
                field = V1ObjectFieldSelector()
                field.field_path = d["path"]
                src = V1EnvVarSource()
                src.field_ref = field
                var.value_from = src
            else:
                var.value = d["value"]
            rst.append(var)
        return rst
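
# Example (sketch): typical use of AXSYSKubeYamlUpdater. The yaml path below is
# hypothetical; AXClusterId / AXClusterConfig must be resolvable in this process
# (see the three situations documented in _get_resource_multipliers).
#
#   updater = AXSYSKubeYamlUpdater("/ax/config/service/axops-svc.yml.in")
#   print(updater.updated_raw)              # yaml text with patched resources
#   for obj in updater.components_in_dict:  # dicts that kube_object.py can consume
#       submit_to_kube(obj)                 # hypothetical consumer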