Beispiel #1
0
class AXPlatform(object):
    def __new__(cls, *args, **kwargs):
        if Cloud().target_cloud_gcp():
            from .gke_platform import AXGKEPlatform
            return super(AXPlatform, cls).__new__(AXGKEPlatform)
        else:
            return super(AXPlatform, cls).__new__(cls)

    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        """
        self._software_info = software_info if software_info else SoftwareInfo(
        )
        assert isinstance(
            self._software_info, SoftwareInfo
        ), "Wrong type ({}) of software info passed in.".format(
            self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        self._cluster_name_id = AXClusterId(
            cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(
                aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we failed to create an object, we don't delete it but just
        # leave it for debug.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        self._kube_config = self._cluster_info.get_kube_config_file_path(
        ) if self._cluster_name_id else None
        if self._cluster_name_id:
            if not os.path.isfile(self._kube_config):
                logger.info(
                    "Can't find config file at %s; downloading from s3",
                    self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(
                self._kube_config), "No kube_config file available"

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        self._monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        self._monitor.start()

        # Kube Objects
        self._kube_objects = {}
        self._replacing = {}

    def _load_kube_objects_from_steps(self, steps):
        """
        Extract kube objects from steps in config, and load them into memory
        :param steps: list
        :return:
        """
        for object_group in steps:
            assert isinstance(object_group, AXPlatformObjectGroup)
            for obj in object_group.object_set:
                assert isinstance(obj, AXPlatformObject)
                name = obj.name
                filename = obj.manifest
                namespace = obj.namespace
                if name in self._kube_objects:
                    raise ValueError("Duplicated object name {}".format(name))
                kubeobj_conf_path = os.path.join(self._manifest_root, filename)
                self._kube_objects[name] = KubeObject(
                    config_file=kubeobj_conf_path,
                    kubepoll=self.kube_poll,
                    replacing=None,
                    kube_config=self._kube_config,
                    kube_namespace=namespace)

    def _generate_replacing(self):
        # Platform code are running in python 2.7, and therefore for trusted cidr list, the str() method
        # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and
        # this 'u' prefix cannot be surpressed. With this prefix, our macro replacing would create invalid
        # yaml files, and therefore we construct string manually here
        trusted_cidr = self._cluster_config.get_trusted_cidr()
        if isinstance(trusted_cidr, list):
            trusted_cidr_str = "["
            for cidr in trusted_cidr:
                trusted_cidr_str += "\"{}\",".format(str(cidr))
            trusted_cidr_str = trusted_cidr_str[:-1]
            trusted_cidr_str += "]"
        else:
            trusted_cidr_str = "[{}]".format(trusted_cidr)

        axsys_cpu = 0
        axsys_mem = 0
        daemon_cpu = 0
        daemon_mem = 0
        for name in self._kube_objects.keys():
            cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage
            axsys_cpu += cpu
            axsys_mem += mem
            daemon_cpu += dcpu
            daemon_mem += dmem

        # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
        # have a memory request, but this is an approximation)
        daemon_cpu += 100
        daemon_mem += 100

        logger.info(
            "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi",
            axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = str(
            int(self._cluster_config.get_min_node_count()) - axsys_node_count)
        axuser_max_count = str(
            int(self._cluster_config.get_max_node_count()) - axsys_node_count)
        autoscaler_scan_interval = str(
            self._cluster_config.get_autoscaler_scan_interval())

        usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["cpu"]
        usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["memory"]
        scale_down_util_thresh = round(
            max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
        logger.info("Setting node scale down utilization threshold to %s",
                    scale_down_util_thresh)

        self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg(
        ) or asg_manager.get_on_demand_asg()
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if not asg_name:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        # Prepare minion-manager.
        spot_instances_option = self._cluster_config.get_spot_instances_option(
        )
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            for asg in asg_manager.get_all_asgs():
                minion_manager_asgs = minion_manager_asgs + asg[
                    "AutoScalingGroupName"] + " "
            minion_manager_asgs = minion_manager_asgs[:-1]
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            minion_manager_asgs = asg_manager.get_variable_asg(
            )["AutoScalingGroupName"]

        return {
            "REGISTRY":
            self._software_info.registry,
            "REGISTRY_SECRETS":
            self._software_info.registry_secrets,
            "NAMESPACE":
            self._software_info.image_namespace,
            "VERSION":
            self._software_info.image_version,
            "AX_CLUSTER_NAME_ID":
            self._cluster_name_id,
            "AX_AWS_REGION":
            self._region,
            "AX_AWS_ACCOUNT":
            self._account,
            "AX_CUSTOMER_ID":
            AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR":
            trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1":
            os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1":
            os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION":
            os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION":
            cluster_install_version,
            "SANDBOX_ENABLED":
            str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME":
            self._cluster_config.get_support_object_store_name(),
            "ASG_MIN":
            axuser_min_count,
            "ASG_MAX":
            axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL":
            autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH":
            str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1":
            self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME":
            asg_name,
            "DNS_SERVER_IP":
            os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES":
            str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS":
            minion_manager_asgs,
        }

    def _persist_node_resource_rsvp(self, user_node_daemon_cpu,
                                    user_node_daemon_mem):
        self._cluster_config.set_user_node_resource_rsvp(
            cpu=user_node_daemon_cpu, mem=user_node_daemon_mem)
        self._cluster_config.save_config()

    def start(self):
        """
        Bring up platform using "platform-start.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects
        steps = self._config.steps
        self._load_kube_objects_from_steps(steps)
        self._replacing = self._generate_replacing()

        # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32)
        # At this moment, we MUST separate first step due to the above dependency
        assert len(steps) >= 2, "Should have at least 1 step to create axops"
        self.create_objects(steps[0])
        self.create_objects(steps[1])
        self.create_objects(steps[2])

        # Prepare axops_eip
        self._set_ext_dns()

        logger.debug("Replacing ENVs: %s", self._replacing)

        info_bound = "=======================================================\n"
        img_namespace = "Image Namespace: {}\n".format(
            self._software_info.image_namespace)
        img_version = "Image Version: {}\n".format(
            self._software_info.image_version)
        start_info = "\n\n{}{}{}{}{}".format(
            info_bound, "Platform Up: Bringing up Argo services...\n",
            img_namespace, img_version, info_bound)
        logger.info(start_info)

        # Start rest of the objects
        for i in range(3, len(steps)):
            self.create_objects(steps[i])

        # update application namespace
        logger.info("Updating application managers")
        for app in Applications(client=self.kubectl).list():
            logger.info("--- updating {}".format(app))
            a = Application(app, client=self.kubectl)
            a.create(force_recreate=True)
        logger.info("Done updating application managers")

        # Upload version information to target cluster
        self._update_version()
        logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n",
                    COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name,
                    COLOR_NORM)

    def stop(self):
        """
        Bring down platform using "platform-stop.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects (Does not need to generate replacing during platform down)
        # Stop order should be the reverse of start
        steps = self._config.steps
        steps.reverse()
        self._load_kube_objects_from_steps(steps)

        info_bound = "=======================================================\n"
        stop_info = "\n\n{}{}{}".format(
            info_bound, "Platform Down: Shutting down Argo services...\n",
            info_bound)
        logger.info(stop_info)

        # Bring down objects according to steps
        for i in range(len(steps)):
            object_group = steps[i]
            self.delete_objects(object_group)

    def stop_monitor(self):
        self._monitor.stop()

    def create_objects(self, objects):
        """
        Start kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate,
                consistency=objects.consistency):
            logger.debug(
                "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                objects.name, objects.policy, objects.policy_predicate,
                objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        logger.info("Creating platform objects\n\n%s",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.start_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)

        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_create_group(self, policy, policy_predicate, consistency):
        """
        Take AXPlatformObjectGroup policy, predicate and consistency and determine
        if this group should be created or not
        :param policy:
        :param policy_predicate:
        :param consistency:
        :return:
        """
        # Since we are not using consistency, we should always create if not
        # explicitly told not to, i.e. if there is a PrivateRegistryOnly
        # We are just leaving the interface here that should create or not
        # need to be decided by policy, policy_predicate and consistency

        if policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly and \
                not self._software_info.registry_is_private():
            return False
        return True

    def delete_objects(self, objects):
        """
        Stop kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_delete_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate):
            logger.debug(
                "Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                objects.name, objects.policy, objects.policy_predicate)
            return
        logger.info("Delete step: %s", objects.name)
        logger.info("Deleting platform objects\n\n%s.",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.stop_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Delete")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_delete_group(self, policy, policy_predicate):
        """
        Take AXPlatformObjectGroup policy and determine if this group should be deleted or not.
        Consistency is not needed
        for deletion

        :param policy:
        :param policy_predicate:
        :return:
        """
        if policy == ObjectGroupPolicy.CreateMany:
            return True
        return False

    def start_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Creating %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]

        # Update them as there are new updates in replacing in platform start
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing

        assert isinstance(kube_obj, KubeObject)
        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }
        if kube_obj.healthy():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        # Previous platform start might fail, and might result in some componenets created
        # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing
        # object and try to create a new one
        if kube_obj.exists():
            logger.warning(
                "Object %s exists but not healthy. Deleting object for idempotency ...",
                name)
            self.stop_one(name, namespace)

        assert not kube_obj.exists(
        ), "Kubeobject {} already created but is not healthy. Not Expected".format(
            name)

        monitor_info = kube_obj.get_create_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout,
                    waiter)

            # Call kubectl create
            kube_obj.create()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], waiter.result))
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    logger.info("Successfully created %s with code %s.",
                                wait_info["name"], waiter.result)
                else:
                    result["failed"] = True
                    logger.error(
                        "Failed to create %s in %s with code %s. Events: %s",
                        wait_info["name"], namespace, waiter.result,
                        str(waiter.details))
                    if not self._debug:
                        logger.info("Deleting %s due to creation failure",
                                    name)
                        del_rst = self.stop_one(name, namespace)
                        result["code"] += del_rst["code"]
                        result["events"] += del_rst["events"]
                        result["duration"] = str(round(time.time() - start, 2))
                        return result

            # Poll extra if required (for Petset and Deployments with multiple replicas)
            if kube_obj.extra_poll:
                logger.info(
                    "Polling till healthy to make sure rest of components of %s are up and running ...",
                    name)
                create_rst = self._poll_till_healthy(
                    name=name,
                    kube_obj=kube_obj,
                    start_time=start,
                    poll_interval=AXPlatformConfigDefaults.
                    ObjCreateExtraPollInterval,
                    poll_max_retry=AXPlatformConfigDefaults.
                    ObjCreateExtraPollMaxRetry,
                    rst=result)
                if create_rst["failed"] and not self._debug:
                    logger.info("Deleting %s due to creation failure", name)
                    del_rst = self.stop_one(name, namespace)
                    create_rst["code"] += del_rst["code"]
                    create_rst["events"] += del_rst["events"]
                    create_rst["duration"] = str(round(time.time() - start, 2))
                return create_rst

            # Poll once to confirm all components from this Kubernetes config file exist,
            # In case there are objects in this config file cannot be monitored, i.e. svc
            # without elb. This is really not expected so we don't delete it
            if not kube_obj.healthy():
                logger.error(
                    "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                    name)
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                result["failed"] = True
                result["events"].append(
                    "Object {} created byt is not healthy".format(name))
            result["duration"] = str(round(time.time() - start, 2))

            if not result["failed"]:
                logger.info("Successfully created object %s.", name)
            return result
        else:
            # use polling
            kube_obj.create()
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                logger.info("Deleting %s due to creation failure", name)
                del_rst = self.stop_one(name, namespace)
                create_rst["code"] += del_rst["code"]
                create_rst["events"] += del_rst["events"]
                create_rst["duration"] = str(round(time.time() - start, 2))
            return create_rst

    @staticmethod
    def _poll_till_healthy(name, kube_obj, start_time, poll_interval,
                           poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if not kube_obj.healthy():
                trail += 1
                if trail > poll_max_retry:

                    logger.error("Failed to create KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY)
                    ]
                    rst["events"] += [
                        "Object {} creation timeout. Not healthy".format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully created %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.OK)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Deleting %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing
        assert isinstance(kube_obj, KubeObject)

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }

        # Don't delete if object does not exist
        if not kube_obj.exists():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        monitor_info = kube_obj.get_delete_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout,
                    waiter)

            # Call kubectl delete
            kube_obj.delete()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.DELETED))
                    logger.info("Successfully deleted %s in %s with code %s.",
                                wait_info["name"], name, result["code"])
                else:
                    result["failed"] = True
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.UNKNOWN))
                    logger.error(
                        "Failed to delete %s in %s with code %s. Events: %s",
                        wait_info["name"], name, result["code"],
                        str(waiter.details))

            # Poll once to confirm all components from this Kubenetes config file exist
            # In case there are objects in this config file cannot be monitored, i.e. svc without elb
            if kube_obj.exists():
                logger.error("Object %s deleted but still exists", name)
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                result["events"].append(
                    "Object {} deleted but still exists.".format(name))
            result["duration"] = str(round(time.time() - start, 2))
            logger.info("Successfully deleted %s.", name)
            return result
        else:
            # use polling
            kube_obj.delete()
            return self._poll_till_not_exists(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
                rst=result)

    @staticmethod
    def _poll_till_not_exists(name, kube_obj, start_time, poll_interval,
                              poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if kube_obj.exists():
                trail += 1
                if trail > poll_max_retry:
                    logger.error("Failed to delete KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNKNOWN)
                    ]
                    rst["events"] += [
                        "Object {} deletion timeout. Please manually check remaining pods"
                        .format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully deleted %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def _generate_object_summary(self, objects):
        """
        :param objects: list of AXPlatformObject
        :return:
        """
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "MANIFEST", "NAMESPACE")
        report_bar = "{}\n".format("-" * 174)
        content = ""
        for obj in objects:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            filename = os.path.join(self._manifest_root, obj.manifest)
            namespace = obj.namespace
            content += "{:25s} |  {:110s} |  {:20s}\n".format(
                name, filename, namespace)

        return report_title + report_bar + content

    @staticmethod
    def _generate_report(results, operation):
        failed = False
        report_body = ""
        warnings = "\n======= WARNING EVENTS =======\n"
        for name in results.keys():
            individual_report = "{:25s} |  {:110s} |  {:20s}\n"
            individual_warning = "{name}: {events}\n\n"
            try:
                result = results[name].get()
                if result["failed"]:
                    failed = True
                code = result["code"][0]
                for c in result["code"][1:]:
                    code += " / {}".format(c)
                individual_report = individual_report.format(
                    name, code, result["duration"], 2)
                if len(result["events"]) > 0:
                    warnings += individual_warning.format(
                        name=name, events=str(result["events"]))
            except Exception as e:
                failed = True
                logger.exception(str(e))
                individual_report = individual_report.format(
                    name, "EXCEPTION", "UNKNOWN")
                warnings += individual_warning.format(name=name, events=str(e))
            report_body += individual_report

        report_head = "\n\nPlatform {} {}. Report:\n".format(
            operation, "FAILED" if failed else "SUCCESSFULLY")
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "STATUS", "TIME (sec)")
        report_bar = "{}\n".format("-" * 174)
        return "{}{}{}{}{}{}".format(
            report_head, report_title, report_bar, report_body, warnings,
            "==============================\n"), failed

    def _get_eip_from_config_map(self):
        try:
            cmd = [
                "kubectl", "get", "configmap", "cluster-dns-name", "-o",
                "yaml", "--namespace", self.kube_axsys_namespace,
                "--kubeconfig", self._kube_config
            ]
            out = subprocess.check_output(cmd)
            return [yaml.load(out)["data"]["cluster-external-dns-name"]]
        except Exception:
            logger.error("Failed to get cluster dns name from config map.")
            return None

    def _get_svc_eip(self, svclabel, namespace):
        svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace,
                                                  svclabel)
        assert len(
            svc.items) == 1, "Currently services should only have one ingress"
        rst = []
        for ig in svc.items[0].status.load_balancer.ingress:
            if ig.hostname:
                rst.append(ig.hostname)
            if ig.ip:
                rst.append(ig.ip)
        return rst

    def _set_ext_dns(self):
        axops_eip = self._get_eip_from_config_map() or self._get_svc_eip(
            svclabel="app=axops", namespace=AXNameSpaces.AXSYS)

        if not axops_eip:
            logger.error(
                "Platform Start Failed: cannot find External IP for AXOPS")
            raise AXPlatformException("AXOPS elastic IP does not exist")

        self.cluster_dns_name = axops_eip[0]
        # Don't change format of this message. Portal parses this line to get cluster IP/DNS.
        logger.info(
            "\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n",
            COLOR_GREEN, self.cluster_dns_name, COLOR_NORM)
        self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name

    def get_cluster_external_dns(self):
        if not self.cluster_dns_name:
            self._set_ext_dns()
        return self.cluster_dns_name

    def _set_autoscaling(self):
        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg(
        ) or asg_manager.get_on_demand_asg()
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if asg_name is not None:
            self._replacing["ASG_NAME"] = asg_name
        else:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

    # TODO (#157) Version should only be uploaded during install and upgrade time
    def _update_version(self):
        # Software info we get during install / upgrade does not contain ami id
        # need to persist it as well
        self._software_info.ami_id = self._cluster_config.get_ami_id()

        AXVersion(AXCustomerId().get_customer_id(), self._cluster_name_id,
                  self._aws_profile).update(self._software_info.to_dict())
    def update_cluster_config(self):
        """
        Upgrade the cluster config in S3 such that it has all required fields.
        """
        logger.info("Updating cluster config!")
        cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id,
                                         aws_profile=self._profile)
        cluster_info = AXClusterInfo(cluster_name_id=self._cluster_name_id,
                                     aws_profile=self._profile)

        # Separate axsys / axuser config if needed
        update_node_config_key_needed = False
        try:
            # New cluster config is looking for "max_node_count" for this method and
            # should throw KeyError if the cluster config in s3 was the old one
            cluster_config.get_max_node_count()
        except KeyError:
            update_node_config_key_needed = True

        if update_node_config_key_needed:
            logger.info("Updating node config keys ...")
            # Parse old raw config directly
            minion_type = cluster_config._conf["cloud"]["configure"][
                "minion_type"]
            max_count = cluster_config._conf["cloud"]["configure"]["max_count"]
            min_count = cluster_config._conf["cloud"]["configure"]["min_count"]
            axsys_count = cluster_config._conf["cloud"]["configure"][
                "axsys_nodes"]

            # Remove all old keys
            for old_key in [
                    "minion_type", "max_count", "min_count", "axsys_nodes"
            ]:
                cluster_config._conf["cloud"]["configure"].pop(old_key, None)

            # Setting new keys
            cluster_config._conf["cloud"]["configure"][
                "axsys_node_count"] = axsys_count
            cluster_config._conf["cloud"]["configure"][
                "max_node_count"] = max_count
            cluster_config._conf["cloud"]["configure"][
                "min_node_count"] = min_count

            # All clusters that needs this upgrade has same node type for axsys and axuser
            cluster_config._conf["cloud"]["configure"][
                "axuser_node_type"] = minion_type
            cluster_config._conf["cloud"]["configure"][
                "axsys_node_type"] = minion_type
        else:
            logger.info("Node config keys are already up-to-date")

        # If cluster type is not set, default it to standard type
        if cluster_config.get_ax_cluster_type() == None:
            cluster_config._conf["cloud"]["configure"][
                "cluster_type"] = AXClusterType.STANDARD

        # Check and update Cluster user. Defaults to "customer"
        if cluster_config.get_ax_cluster_user() is None:
            cluster_config.set_ax_cluster_user('customer')

        # Check and update Cluster size. Defaults to "small"
        if cluster_config.get_ax_cluster_size() is None:
            max_count = cluster_config.get_max_node_count()
            if max_count == 5:
                cluster_size = "small"
            elif max_count == 10:
                cluster_size = "medium"
            elif max_count == 21:
                cluster_size = "large"
            elif max_count == 30:
                cluster_size = "xlarge"
            else:
                cluster_size = "small"
            cluster_config.set_ax_cluster_size(cluster_size)

        # Check and update AX Volume size. Note that this has to come *AFTER* the cluster_size is set.
        if cluster_config.get_ax_vol_size() is None:
            cluster_size = cluster_config.get_ax_cluster_size()
            if cluster_size in ("small", "medium"):
                vol_size = 100
            elif cluster_size == "large":
                vol_size = 200
            elif cluster_size == "xlarge":
                vol_size = 400
            else:
                vol_size = 100
            cluster_config.set_ax_vol_size(vol_size)

        # Ensure that we have 3 tiers now
        cluster_config.set_node_tiers("master/applatix/user")

        # set new ami id
        ami_name = os.getenv("AX_AWS_IMAGE_NAME")
        ami_id = AMI(
            aws_region=self._region,
            aws_profile=self._profile).get_ami_id_from_name(ami_name=ami_name)
        logger.info("Updating cluster config with ami %s", ami_id)
        cluster_config.set_ami_id(ami_id)

        cluster_config.save_config()
class ClusterInstaller(ClusterOperationBase):
    def __init__(self, cfg, kubeconfig=None):
        assert isinstance(cfg, ClusterInstallConfig)
        self._cfg = cfg
        super(ClusterInstaller,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile,
                             generate_name_id=True,
                             dry_run=self._cfg.dry_run)

        self._name_id = self._idobj.get_cluster_name_id()

        # Ensure cluster buckets before instantiating any class that uses cluster buckets
        # Note that AXClusterId object is an exception as we need to create cluster name_id
        # first, instantiating buckets, and finally upload cluster name id
        # TODO (#116) bucket initialization should not depend on cluster name id
        AXClusterBuckets(name_id=self._name_id,
                         aws_profile=self._cfg.cloud_profile,
                         aws_region=self._cfg.cloud_region).update()

        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        if kubeconfig:
            self._cluster_info.set_kube_config(kubeconfig)

    def pre_run(self):
        if self._csm.is_running():
            logger.info(
                "Cluster is already installed and running. Please ask your administrator"
            )
            sys.exit(0)
        self._csm.do_install()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        self._csm.done_install()
        self._persist_cluster_state_if_needed()

    def persist_username_password_locally(self, username, password,
                                          cluster_dns):
        # Dump Argo cluster profile
        if username and password:
            logger.info("Generating Argo cluster profile ...")
            argo_config_path = ARGO_CONFIG.format(
                fname=self._idobj.get_cluster_name_id())
            with open(argo_config_path, "w") as f:
                f.write("""
insecure: true
password: {password}
url: https://{dns}
username: {username}
""".format(password=password, dns=cluster_dns, username=username))
            if not os.path.exists(ARGO_CONFIG_DEFAULT):
                # if user has not yet configured default argo config, symlink a default config to the one just created
                os.symlink(os.path.basename(argo_config_path),
                           ARGO_CONFIG_DEFAULT)

        summary = """
              Cluster Name:  {cluster_name}
                Cluster ID:  {cluster_id}
      Cluster Profile Name:  {name_id}
               Cluster DNS:  {dns}
          Initial Username:  {username}
          Initial Password:  {password}

Note if your username and password are empty, your cluster has already been successfully installed before.

In this case, your argo CLI profile is NOT configured, as we only generate initial username and password once,
please contact your administrator for more information to configure your argo CLI profile.
        """.format(cluster_name=self._idobj.get_cluster_name(),
                   cluster_id=self._idobj.get_cluster_id(),
                   name_id=self._name_id,
                   dns=cluster_dns,
                   username=username,
                   password=password)
        logger.info("Cluster information:\n%s%s%s\n", COLOR_GREEN, summary,
                    COLOR_NORM)

    def run(self):
        """
        Main install routine
        :return:
        """
        self._pre_install()
        self._ensure_kubernetes_cluster()
        if self._cfg.dry_run:
            logger.info("DRY RUN: not installing cluster")
            return
        cluster_dns, username, password = self._ensure_argo_microservices()

        self.persist_username_password_locally(username, password, cluster_dns)

    def _pre_install(self):
        """
        Pre install ensures the following stuff:
            - Cluster name/id mapping is created and uploaded
            - A local copy of cluster config is generated
            - Upload stage0 information to S3

        Stage0 is an indication of the fact that at least some part of the cluster could have been created.
        This step is idempotent.
        :return:
        """
        if check_cluster_staging(self._cluster_info, "stage0"):
            logger.info("Skip pre install")
            return

        logger.info("Cluster installation step: Pre Install")

        # After buckets are ensured, we persist cluster name id information
        # There is no problem of re-uploading we rerun this step
        self._idobj.upload_cluster_name_id()

        # Generate raw config dict
        raw_cluster_config_dict = self._generate_raw_cluster_config_dict()

        # Set cluster config object with raw cluster config dict
        self._cluster_config.set_config(raw_cluster_config_dict)

        # Prepare configuration for kube installer. This call will write
        prepare_kube_install_config(name_id=self._name_id,
                                    aws_profile=self._cfg.cloud_profile,
                                    cluster_info=self._cluster_info,
                                    cluster_config=self._cluster_config)

        # Save config file to s3, which is also stage0 information
        self._cluster_config.save_config()
        logger.info(
            "Cluster installation step: Pre Install successfully finished")

    def _ensure_kubernetes_cluster(self):
        """
        This step won't run if there is "--dry-run" specified.

        This step assumes pre-install is already finished. This step does the following:
            - Config kube-installer
            - Call kube-installer to create Kubernetes cluster
            - Persist cluster credentials (Kubeconfig file and ssh key) to S3
            - Upload finalized cluster config and cluster metadata to S3
            - Upload stage1 information to S3

        Stage1 is an indication of the fact that there is a kubernetes cluster ready, and we can
        create micro-services on it.

        This step is NOT necessarily idempotent:
        e.g., if you created master but install failed due to cloud provider rate limit, and as a result,
        you have not yet created minions, for some reasons you quit your cluster manager container, all
        your cluster credentials can be lost.

        So if this step fails, the safest way is to uninstall the half installed cluster and start another install
        :return:
        """
        if check_cluster_staging(self._cluster_info, "stage1"):
            logger.info("Skip ensure Kubernetes cluster")
            return

        logger.info("Cluster installation step: Ensure Kubernetes Cluster")

        # Reload config in case stage0 is skipped
        self._cluster_config.reload_config()
        logger.info("Creating cluster with config: \n\n%s\n",
                    pformat(self._cluster_config.get_raw_config()))

        # if dry-run is specified, this step should be skipped
        if self._cfg.dry_run:
            return

        # Call kube-up
        logger.info("\n\n%sCalling kube-up ...%s\n", COLOR_GREEN, COLOR_NORM)
        AXKubeUpDown(cluster_name_id=self._name_id,
                     env=self._cluster_config.get_kube_installer_config(),
                     aws_profile=self._cfg.cloud_profile).up()

        # kube-up will generate cluster metadata. We add information from cluster metadata into cluster config
        logger.info("Loading cluster meta into cluster config ...")
        with open(CLUSTER_META_DATA_PATH, "r") as f:
            data = f.read()
        cluster_meta = yaml.load(data)
        self._cluster_config.load_cluster_meta(cluster_meta)

        # Persist updated cluster config
        self._cluster_config.save_config()

        # Upload cluster metadata
        self._cluster_info.upload_cluster_metadata()

        # Finally persist stage1
        self._cluster_info.upload_staging_info(stage="stage1", msg="stage1")

        logger.info(
            "Cluster installation step: Ensure Kubernetes Cluster successfully finished"
        )

    def install_and_run_platform(self):
        logger.info("Starting platform install")

        # Install Argo micro-services
        # Platform install
        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=self._cfg.manifest_root,
                              config_file=self._cfg.bootstrap_config)

        install_platform_failed = False
        install_platform_failure_message = ""
        try:
            platform.start()
            platform.stop_monitor()
        except Exception as e:
            logger.exception(e)
            install_platform_failed = True
            install_platform_failure_message = str(
                e
            ) + "\nPlease manually check the cluster status and retry installation with same command if the error is transient."

        if install_platform_failed:
            raise RuntimeError(install_platform_failure_message)

        # In case platform is successfully installed,
        # connect to axops to get initial username and password
        username, password = self._get_initial_cluster_credentials()

        logger.info("Done with platform install")
        return platform.cluster_dns_name, username, password

    def post_install(self):
        # Persist manifests to S3
        self._cluster_info.upload_platform_manifests_and_config(
            platform_manifest_root=self._cfg.manifest_root,
            platform_config=self._cfg.bootstrap_config)

        # Finally persist stage2 information
        self._cluster_info.upload_staging_info(stage="stage2", msg="stage2")
        logger.info(
            "Cluster installation step: Ensure Argo Micro-services successfully finished"
        )

    def _ensure_argo_microservices(self):
        """
        This step won't run if there is "--dry-run" specified.

        This step assumes there is a running Kubernetes cluster. This step does the following:
            - ensure ASG count
            - ensure trusted CIDRs
            - install Argo software on to the cluster and make sure they are up and running (We don't monitor
               if the microservice is having a crash loop)
            - Remove manager CIDR if it is not part of user-specified trusted CIDRs
            - Upload stage2 information to S3

        Stage2 is an indication that the cluster has been successfully installed: Kubernetes is up and running, and
        all Argo software are up and running. It does not ensure that non of Argo software should be in crash loop
        This step is idempotent
        :return: cluster_dns_name, username, password
        """
        logger.info("Cluster installation step: Ensure Argo Micro-services")

        # Reload config in case stage0 and stage1 are skipped
        self._cluster_config.reload_config()

        trusted_cidrs = self._cluster_config.get_trusted_cidr()

        # Instantiate AXBootstrap object. There are a bunch of stand-alone tasks we need to
        # perform using that object.
        axbootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile,
                                  region=self._cluster_config.get_region())

        # We allow access from everywhere during installation phase, but will remove this access
        # if user does not specify 0.0.0.0/0 as their trusted CIDR
        axbootstrap.modify_node_security_groups(old_cidr=[],
                                                new_cidr=trusted_cidrs +
                                                [EC2IPPermission.AllIP],
                                                action_name="allow-creator")

        if check_cluster_staging(self._cluster_info, "stage2"):
            # TODO: some duplicated logic here, might need to combine them.
            logger.info(
                "Skip ensure Argo micro-services since cluster has already been successfully installed"
            )
            platform = AXPlatform(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile)
            if EC2IPPermission.AllIP not in trusted_cidrs:
                axbootstrap.modify_node_security_groups(
                    old_cidr=[EC2IPPermission.AllIP],
                    new_cidr=[],
                    action_name="disallow-creator")
            return platform.get_cluster_external_dns(), "", ""

        # Modify ASG
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = int(
            self._cluster_config.get_min_node_count()) - axsys_node_count
        axuser_max_count = int(
            self._cluster_config.get_max_node_count()) - axsys_node_count
        axbootstrap.modify_asg(min=axuser_min_count, max=axuser_max_count)

        cluster_dns, username, password = self.install_and_run_platform()

        self.post_install()

        # Remove access from 0.0.0.0/0 if this is not what user specifies
        if EC2IPPermission.AllIP not in trusted_cidrs:
            axbootstrap.modify_node_security_groups(
                old_cidr=[EC2IPPermission.AllIP],
                new_cidr=[],
                action_name="disallow-creator")

        return cluster_dns_name, username, password

    @retry(wait_fixed=5, stop_max_attempt_number=10)
    def _get_initial_cluster_credentials(self):
        """
        This functions connects to axops pod to get cluster's initial credentials
        :return: (username, password)
        """
        # TODO: a less hacky way of getting initial credentials?
        ns_conf = "--namespace axsys --kubeconfig {config}".format(
            config=self._cluster_info.get_kube_config_file_path())
        cmd = "kubectl " + ns_conf + " exec $(kubectl " + ns_conf + " get pods -l app=axops-deployment | grep axops | awk '{print $1}') /axops/bin/axpassword -c axops"
        ret = subprocess.check_output(cmd, shell=True)
        username = None
        password = None
        for line in ret.split("\n"):
            if line.startswith("Username"):
                # Username line has format "Username: xxxxxxx"
                username = line[len("Username: "******"Password"):
                # Password line has format "Password: xxxxxx"
                password = line[len("Password: "******"Failed to get username and password from axops pod: {}".format(
            ret)
        return username, password

    def _generate_raw_cluster_config_dict(self):
        """
        This is a standalone method to generate cluster config dictionary based on install config. We might want to
        move it to ax.platform.cluster_config package for sanity
        :return:
        """
        config_file_name = CLUSTER_CONFIG_TEMPLATES[self._cfg.cluster_size]
        config_file_full_path = os.path.join(
            *[CLUSTER_CONFIG_ROOT, self._cfg.cluster_type, config_file_name])
        with open(config_file_full_path, "r") as f:
            config = json.load(f)

        if Cloud().target_cloud_aws():
            return self._generate_raw_cluster_config_dict_aws(config)
        elif Cloud().target_cloud_gcp():
            return self._generate_raw_cluster_config_dict_gcp(config)
        else:
            # Should never come here as aws/gcp is ensured at CLI validation level
            return config

    def _generate_raw_cluster_config_dict_aws(self, config):
        """
        Generate AWS specific cluster config.
        :param config:
        :return:
        """
        # TODO: once we support installing with config file, we only overwrite when item is specifically set through CLI
        config["cloud"]["configure"]["region"] = self._cfg.cloud_region
        config["cloud"]["configure"]["placement"] = self._cfg.cloud_placement
        config["cloud"]["trusted_cidr"] = self._cfg.trusted_cidrs
        config["cloud"]["vpc_id"] = self._cfg.vpc_id

        # If we install into existing VPC, i.e. vpc_id is not None, or we are going to fetch it
        # from cluster metadata after cluster is created.
        config["cloud"][
            "vpc_cidr_base"] = self._cfg.vpc_cidr_base if not self._cfg.vpc_id else None
        config["cloud"]["subnet_size"] = self._cfg.subnet_mask_size
        config["cloud"]["configure"][
            "sandbox_enabled"] = self._cfg.enable_sandbox

        # TODO (#119): might want to remove this filed as this was used for hacks before. Setting it to "dev" for now
        config["cloud"]["configure"]["cluster_user"] = "******"

        # TODO (#117): Switch all spot related options by literals rather than true/false and some other hacks
        # also need to revise the need of specifying a spot price during installation
        if self._cfg.spot_instances_option in [
                SpotInstanceOption.PARTIAL_SPOT, SpotInstanceOption.ALL_SPOT
        ]:
            spot_instances_enabled = "true"
        else:
            spot_instances_enabled = "false"
        config["cloud"]["configure"][
            "spot_instances_enabled"] = spot_instances_enabled
        config["cloud"]["configure"][
            "spot_instances_option"] = self._cfg.spot_instances_option
        config["cloud"]["node_spot_price"] = DEFAULT_NODE_SPOT_PRICE

        # Configure master
        axsys_node_type = config["cloud"]["configure"]["axsys_node_type"]
        axsys_node_max = config["cloud"]["configure"]["axsys_node_count"]
        axuser_node_type = config["cloud"]["configure"]["axuser_node_type"]
        axuser_node_max = config["cloud"]["configure"][
            "max_node_count"] - axsys_node_max
        cluster_type = config["cloud"]["configure"]["cluster_type"]
        if self._cfg.cluster_size != AXClusterSize.CLUSTER_USER_PROVIDED:
            master_config = KubeMasterResourceConfig(
                usr_node_type=axuser_node_type,
                usr_node_max=axuser_node_max,
                ax_node_type=axsys_node_type,
                ax_node_max=axsys_node_max,
                cluster_type=cluster_type)
            if self._cfg.cluster_size == AXClusterSize.CLUSTER_MVC:
                # MVC cluster does not follow the heuristics we used to configure master
                config["cloud"]["configure"]["master_type"] = "m3.xlarge"
            else:
                config["cloud"]["configure"][
                    "master_type"] = master_config.master_instance_type
            config["cloud"]["configure"][
                "master_config_env"] = master_config.kube_up_env

        # TODO (#121) Need to revise the relationship between user_on_demand_nodes and node minimum, system node count
        config["cloud"]["configure"][
            "axuser_on_demand_nodes"] = self._cfg.user_on_demand_nodes

        # Get AMI information
        ami_name = self._cfg.software_info.ami_name
        ami_id = AMI(aws_profile=self._cfg.cloud_profile,
                     aws_region=self._cfg.cloud_region).get_ami_id_from_name(
                         ami_name=ami_name)
        config["cloud"]["configure"]["ami_id"] = ami_id

        # Other configurations
        config["cloud"]["configure"]["autoscaler_scan_interval"] = str(
            self._cfg.autoscaling_interval) + "s"
        config["cloud"]["configure"]["support_object_store_name"] = str(
            self._cfg.support_object_store_name)

        return config

    def _generate_raw_cluster_config_dict_gcp(self, config):
        """
        Generate GCP specific cluster config.
        :param config:
        :return:
        """
        config["cloud"]["trusted_cidr"] = self._cfg.trusted_cidrs
        return config

    def update_and_save_config(self, cluster_bucket=None):
        """
        Update the config to use the given bucket and upload cluster_config and kubeconfig
        to the given bucket.
        """
        raw_cluster_config_dict = self._generate_raw_cluster_config_dict()
        self._cluster_config.set_config(raw_cluster_config_dict)
        self._cluster_config.set_cluster_provider(ClusterProvider.USER)
        self._cluster_config.set_support_object_store_name(cluster_bucket)

        # Save config file to s3.
        self._cluster_config.save_config()

        self._cluster_info.upload_kube_config()
Beispiel #4
0
class AXSYSKubeYamlUpdater(object):
    """
    This class loads a kubernetes yaml file, updates resource,
    and generate objects that kube_object.py can consume
    """
    def __init__(self, config_file_path):
        assert os.path.isfile(
            config_file_path), "Config file {} is not a file".format(
                config_file_path)
        self._config_file = config_file_path
        self._cluster_name_id = AXClusterId().get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id)
        self.cpu_mult, self.mem_mult, self.disk_mult, \
            self.daemon_cpu_mult, self.daemon_mem_mult = self._get_resource_multipliers()
        self._swagger_components = []
        self._yaml_components = []
        self._updated_raw = ""

        # TODO: when we support config software info using a config file, need to figure out how that
        # file gets passed through, since SoftwareInfo is not a singleton
        self._software_info = SoftwareInfo()

        self._load_objects()
        self._load_raw()

    @property
    def updated_raw(self):
        return self._updated_raw

    @property
    def components_in_dict(self):
        return self._yaml_components

    @property
    def components_in_swagger(self):
        return self._swagger_components

    def _load_objects(self):
        with open(self._config_file, "r") as f:
            data = f.read()
        for c in yaml.load_all(data):
            swagger_obj = self._config_yaml(c)
            yaml_obj = ApiClient().sanitize_for_serialization(swagger_obj)
            self._swagger_components.append(swagger_obj)
            self._yaml_components.append(yaml_obj)

    def _load_raw(self):
        self._updated_raw = yaml.dump_all(self._yaml_components)

    def _get_resource_multipliers(self):
        """
        Resources in yaml templates need to be multiplied with these numbers
        :return: cpu_multiplier, mem_multiplier, disk_multiplier
        """
        # Getting cluster size from cluster config, in order to configure resources
        # There are 3 situations we will be using AXClusterConfig
        #   - During install, since the class is a singleton, it has all the values we need
        #     no need to download from s3
        #   - During upgrade, since we are exporting AWS_DEFAULT_PROFILE, we can download
        #     cluster config files from s3 to get the values
        #   - During job creation: the node axmon runs has the proper roles to access s3

        try:
            ax_node_max = int(self._cluster_config.get_asxys_node_count())
            ax_node_type = self._cluster_config.get_axsys_node_type()
            usr_node_max = int(
                self._cluster_config.get_max_node_count()) - ax_node_max
            usr_node_type = self._cluster_config.get_axuser_node_type()
            assert all(
                [ax_node_max, ax_node_type, usr_node_max, usr_node_type])
        except Exception as e:
            logger.error(
                "Unable to read cluster config, skip resource config for %s. Error %s",
                self._config_file, e)
            return 1, 1, 1, 1, 1

        rc = AXSYSResourceConfig(
            ax_node_type=ax_node_type,
            ax_node_max=ax_node_max,
            usr_node_type=usr_node_type,
            usr_node_max=usr_node_max,
            cluster_type=self._cluster_config.get_ax_cluster_type())
        #logger.info("With %s %s axsys nodes, %s %s axuser nodes, component %s uses multipliers (%s, %s, %s, %s, %s)",
        #            ax_node_max, ax_node_type, usr_node_max, usr_node_type, self._config_file,
        #            rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier,
        #            rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier)
        return rc.cpu_multiplier, rc.mem_multiplier, rc.disk_multiplier, rc.daemon_cpu_multiplier, rc.daemon_mem_multiplier

    def _config_yaml(self, kube_yaml_obj):
        """
        Load dict into swagger object, patch resource,
        sanitize, return a dict
        :param kube_yaml_obj:
        :return: swagger object with resource values finalized
        """
        kube_kind = kube_yaml_obj["kind"]
        (swagger_class_literal,
         swagger_instance) = KubeKindToV1KubeSwaggerObject[kube_kind]
        swagger_obj = ApiClient()._ApiClient__deserialize(
            kube_yaml_obj, swagger_class_literal)
        assert isinstance(swagger_obj, swagger_instance), \
            "{} has instance {}, expected {}".format(swagger_obj, type(swagger_obj), swagger_instance)

        if isinstance(swagger_obj, V1beta1Deployment):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None

            node_selector = swagger_obj.spec.template.spec.node_selector
            if node_selector.get('ax.tier', 'applatix') == 'master':
                # Skip updating containers on master.
                logger.info(
                    "Skip updating cpu, mem multipliers for pods on master: %s",
                    swagger_obj.metadata.name)
            else:
                for container in swagger_obj.spec.template.spec.containers:
                    self._update_container(container)
            return swagger_obj
        elif isinstance(swagger_obj, V1Pod):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.image_pull_secrets = None
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1DaemonSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            for container in swagger_obj.spec.template.spec.containers:
                # We are special-casing applet DaemonSet to compromise the fact that
                # we are using different node type for compute-intense nodes
                if swagger_obj.metadata.name == "applet":
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=True)
                else:
                    self._update_container(container=container,
                                           is_daemon=True,
                                           update_resource=False)
            return swagger_obj
        elif isinstance(swagger_obj, V1beta1StatefulSet):
            if not self._software_info.registry_is_private():
                swagger_obj.spec.template.spec.image_pull_secrets = None
            return self._update_statefulset(swagger_obj)
        elif isinstance(swagger_obj, V1PersistentVolumeClaim):
            self._update_volume(swagger_obj)
            return swagger_obj
        else:
            # logger.info("Object %s does not need to configure resource", type(swagger_obj))
            # HACK, as the original hook will be messed up
            if isinstance(swagger_obj, V1Service):
                if swagger_obj.metadata.name == "axops":
                    swagger_obj.spec.load_balancer_source_ranges = []
                    for cidr in self._cluster_config.get_trusted_cidr():
                        # Seems swagger client does not support unicode ... SIGH
                        swagger_obj.spec.load_balancer_source_ranges.append(
                            str(cidr))

                # HACK #2: if we don't do this, kubectl will complain about something such as
                #
                # spec.ports[0].targetPort: Invalid value: "81": must contain at least one letter (a-z)
                #
                # p.target_port is defined as string though, but if its really a string, kubectl
                # is looking for a port name, rather than a number
                # SIGH ...
                for p in swagger_obj.spec.ports or []:
                    try:
                        p.target_port = int(p.target_port)
                    except (ValueError, TypeError):
                        pass
            return swagger_obj

    def _update_deployment_or_daemonset(self, kube_obj):
        assert isinstance(kube_obj, V1beta1Deployment) or isinstance(
            kube_obj, V1beta1DaemonSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        return kube_obj

    def _update_statefulset(self, kube_obj):
        assert isinstance(kube_obj, V1beta1StatefulSet)
        for container in kube_obj.spec.template.spec.containers:
            self._update_container(container)
        if isinstance(kube_obj.spec.volume_claim_templates, list):
            for vol in kube_obj.spec.volume_claim_templates:
                self._update_volume(vol)
        return kube_obj

    def _update_container(self,
                          container,
                          is_daemon=False,
                          update_resource=True):
        assert isinstance(container, V1Container)

        if update_resource:
            cpulim = container.resources.limits.get("cpu")
            memlim = container.resources.limits.get("memory")
            cpureq = container.resources.requests.get("cpu")
            memreq = container.resources.requests.get("memory")

            def _massage_cpu(orig):
                return orig * self.daemon_cpu_mult if is_daemon else orig * self.cpu_mult

            def _massage_mem(orig):
                return orig * self.daemon_mem_mult if is_daemon else orig * self.mem_mult

            if cpulim:
                rvc = ResourceValueConverter(value=cpulim, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.limits["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if cpureq:
                rvc = ResourceValueConverter(value=cpureq, target="cpu")
                rvc.massage(_massage_cpu)
                container.resources.requests["cpu"] = "{}m".format(
                    rvc.convert("m"))
            if memlim:
                rvc = ResourceValueConverter(value=memlim, target="memory")
                rvc.massage(_massage_mem)
                container.resources.limits["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))
            if memreq:
                rvc = ResourceValueConverter(value=memreq, target="memory")
                rvc.massage(_massage_mem)
                container.resources.requests["memory"] = "{}Mi".format(
                    int(rvc.convert("Mi")))

        if container.liveness_probe and container.liveness_probe.http_get:
            try:
                container.liveness_probe.http_get.port = int(
                    container.liveness_probe.http_get.port)
            except (ValueError, TypeError):
                pass
        if container.readiness_probe and container.readiness_probe.http_get:
            try:
                container.readiness_probe.http_get.port = int(
                    container.readiness_probe.http_get.port)
            except (ValueError, TypeError):
                pass

        # Add resource multiplier to containers in case we need them
        if not container.env:
            container.env = []
        container.env += self._generate_default_envs(is_daemon,
                                                     update_resource)

    def _update_volume(self, vol):
        assert isinstance(vol, V1PersistentVolumeClaim)
        vol_size = vol.spec.resources.requests["storage"]

        def _massage_disk(orig):
            return orig * self.disk_mult

        if vol_size:
            rvc = ResourceValueConverter(value=vol_size, target="storage")
            rvc.massage(_massage_disk)
            # Since AWS does not support value such as 1.5G, lets round up to its ceil
            vol.spec.resources.requests["storage"] = "{}Gi".format(
                int(ceil(rvc.convert("Gi"))))

        # Manually patch access mode as swagger client mistakenly interprets this as map
        vol.spec.access_modes = ["ReadWriteOnce"]

    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Add essential variables to all system containers
        :param is_daemon:
        :return:
        """
        default_envs = [
            # Kubernetes downward APIs
            {
                "name": "AX_NODE_NAME",
                "path": "spec.nodeName"
            },
            {
                "name": "AX_POD_NAME",
                "path": "metadata.name"
            },
            {
                "name": "AX_POD_NAMESPACE",
                "path": "metadata.namespace"
            },
            {
                "name": "AX_POD_IP",
                "path": "status.podIP"
            },

            # Values
            {
                "name": "DISK_MULT",
                "value": str(self.disk_mult)
            },
            {
                "name": "AX_TARGET_CLOUD",
                "value": Cloud().target_cloud()
            },
            {
                "name": "AX_CLUSTER_NAME_ID",
                "value": self._cluster_name_id
            },
            {
                "name": "AX_CUSTOMER_ID",
                "value": AXCustomerId().get_customer_id()
            },
        ]

        # Special cases for daemons
        if is_daemon:
            if resource_updated:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": str(self.daemon_cpu_mult)
                    },
                    {
                        "name": "MEM_MULT",
                        "value": str(self.daemon_mem_mult)
                    },
                ]
            else:
                default_envs += [
                    {
                        "name": "CPU_MULT",
                        "value": "1.0"
                    },
                    {
                        "name": "MEM_MULT",
                        "value": "1.0"
                    },
                ]
        else:
            default_envs += [
                {
                    "name": "CPU_MULT",
                    "value": str(self.cpu_mult)
                },
                {
                    "name": "MEM_MULT",
                    "value": str(self.mem_mult)
                },
            ]

        rst = []
        for d in default_envs:
            var = V1EnvVar()
            var.name = d["name"]

            if d.get("path", None):
                field = V1ObjectFieldSelector()
                field.field_path = d["path"]
                src = V1EnvVarSource()
                src.field_ref = field
                var.value_from = src
            else:
                var.value = d["value"]
            rst.append(var)
        return rst