Example #1
0
class AXPlatform(object):
    def __new__(cls, *args, **kwargs):
        """
        Allocate the platform instance, picking the concrete class by target cloud.

        When the target cloud is GCP an AXGKEPlatform instance is allocated;
        otherwise an instance of the requested class is allocated.
        """
        if not Cloud().target_cloud_gcp():
            return super(AXPlatform, cls).__new__(cls)

        # Imported only on the GCP path.
        from .gke_platform import AXGKEPlatform
        return super(AXPlatform, cls).__new__(AXGKEPlatform)

    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        Resolves the cluster identity and configuration, makes sure a kube config
        file is available when a cluster name id is known, creates the Kubernetes
        client and poller, and starts the kube monitor on the axsys namespace.

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        :param config_file: platform configuration file loaded into AXPlatformConfig
        :param software_info: SoftwareInfo instance; a default SoftwareInfo() is
                              created when a falsy value is passed
        """
        # Fall back to a default SoftwareInfo when the caller supplies none.
        self._software_info = software_info if software_info else SoftwareInfo(
        )
        assert isinstance(
            self._software_info, SoftwareInfo
        ), "Wrong type ({}) of software info passed in.".format(
            self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        # Resolve the cluster name id through AXClusterId rather than trusting
        # the raw argument.
        self._cluster_name_id = AXClusterId(
            cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        # NOTE(review): this passes the raw cluster_name_id argument (possibly
        # None) while its neighbours use the resolved self._cluster_name_id --
        # confirm this is intended.
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        # The account id is only looked up when targeting AWS; otherwise it is
        # left as an empty string.
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(
                aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we failed to create an object, we don't delete it but just
        # leave it for debug.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        self._kube_config = self._cluster_info.get_kube_config_file_path(
        ) if self._cluster_name_id else None
        if self._cluster_name_id:
            # Download the kube config when the expected file is missing, then
            # insist that a file is present before continuing.
            if not os.path.isfile(self._kube_config):
                logger.info(
                    "Can't find config file at %s; downloading from s3",
                    self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(
                self._kube_config), "No kube_config file available"

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        # The monitor is started here; stop_monitor() stops it again.
        self._monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        self._monitor.start()

        # Kube Objects
        # _kube_objects maps object name -> KubeObject; _replacing holds the
        # macro replacement dict applied to each object before create/delete.
        self._kube_objects = {}
        self._replacing = {}

    def _load_kube_objects_from_steps(self, steps):
        """
        Build an in-memory KubeObject for every platform object listed in steps.

        Each object is stored in self._kube_objects under its name, with its
        manifest path resolved against the manifest root.

        :param steps: iterable of AXPlatformObjectGroup
        :return: None
        :raises ValueError: if an object name is already loaded
        """
        for group in steps:
            assert isinstance(group, AXPlatformObjectGroup)
            for platform_obj in group.object_set:
                assert isinstance(platform_obj, AXPlatformObject)
                obj_name, manifest, obj_namespace = (
                    platform_obj.name, platform_obj.manifest,
                    platform_obj.namespace)
                if obj_name in self._kube_objects:
                    raise ValueError(
                        "Duplicated object name {}".format(obj_name))
                self._kube_objects[obj_name] = KubeObject(
                    config_file=os.path.join(self._manifest_root, manifest),
                    kubepoll=self.kube_poll,
                    replacing=None,
                    kube_config=self._kube_config,
                    kube_namespace=obj_namespace)

    def _generate_replacing(self):
        """
        Build the macro name -> value mapping substituted into Kubernetes manifests.

        Besides collecting values, this method has side effects: it persists the
        per-node daemon resource reservation into the cluster config and reads
        the cluster install version from /kubernetes/cluster/version.txt.

        :return: dict mapping macro name to its replacement value
        :raises AXPlatformException: if the autoscaling group (or its name)
                                     required for the macros cannot be found
        """
        # Platform code are running in python 2.7, and therefore for trusted cidr list, the str() method
        # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and
        # this 'u' prefix cannot be suppressed. With this prefix, our macro replacing would create invalid
        # yaml files, and therefore we construct string manually here
        trusted_cidr = self._cluster_config.get_trusted_cidr()
        if isinstance(trusted_cidr, list):
            # join() keeps an empty list rendering as "[]"; trimming a trailing
            # comma by slicing would eat the opening bracket in that case.
            trusted_cidr_str = "[{}]".format(",".join(
                "\"{}\"".format(str(cidr)) for cidr in trusted_cidr))
        else:
            trusted_cidr_str = "[{}]".format(trusted_cidr)

        # Sum up resource usage reported by every loaded kube object.
        axsys_cpu = 0
        axsys_mem = 0
        daemon_cpu = 0
        daemon_mem = 0
        for kube_obj in self._kube_objects.values():
            cpu, mem, dcpu, dmem = kube_obj.resource_usage
            axsys_cpu += cpu
            axsys_mem += mem
            daemon_cpu += dcpu
            daemon_mem += dmem

        # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
        # have a memory request, but this is an approximation)
        daemon_cpu += 100
        daemon_mem += 100

        logger.info(
            "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi",
            axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

        # User node counts are the cluster totals minus the axsys nodes.
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = str(
            int(self._cluster_config.get_min_node_count()) - axsys_node_count)
        axuser_max_count = str(
            int(self._cluster_config.get_max_node_count()) - axsys_node_count)
        autoscaler_scan_interval = str(
            self._cluster_config.get_autoscaler_scan_interval())

        # Fraction of a user node taken by daemons; the scale down threshold
        # is set just above the larger of the cpu / memory fractions.
        user_node_params = EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]
        usr_node_cpu_rsvp = float(daemon_cpu) / user_node_params["cpu"]
        usr_node_mem_rsvp = float(daemon_mem) / user_node_params["memory"]
        scale_down_util_thresh = round(
            max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
        logger.info("Setting node scale down utilization threshold to %s",
                    scale_down_util_thresh)

        self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = (asg_manager.get_variable_asg() or asg_manager.get_spot_asg()
               or asg_manager.get_on_demand_asg())
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if not asg_name:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        # Prepare minion-manager.
        spot_instances_option = self._cluster_config.get_spot_instances_option(
        )
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            # Space separated list of every autoscaling group name.
            minion_manager_asgs = " ".join(
                group["AutoScalingGroupName"]
                for group in asg_manager.get_all_asgs())
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            variable_asg = asg_manager.get_variable_asg()
            if not variable_asg:
                # Fail with a meaningful error instead of a TypeError raised
                # by subscripting a missing group.
                raise AXPlatformException(
                    "Failed to get variable autoscaling group for cluster {}".
                    format(self._cluster_name_id))
            minion_manager_asgs = variable_asg["AutoScalingGroupName"]

        return {
            "REGISTRY":
            self._software_info.registry,
            "REGISTRY_SECRETS":
            self._software_info.registry_secrets,
            "NAMESPACE":
            self._software_info.image_namespace,
            "VERSION":
            self._software_info.image_version,
            "AX_CLUSTER_NAME_ID":
            self._cluster_name_id,
            "AX_AWS_REGION":
            self._region,
            "AX_AWS_ACCOUNT":
            self._account,
            "AX_CUSTOMER_ID":
            AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR":
            trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1":
            os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1":
            os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION":
            os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION":
            cluster_install_version,
            "SANDBOX_ENABLED":
            str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME":
            self._cluster_config.get_support_object_store_name(),
            "ASG_MIN":
            axuser_min_count,
            "ASG_MAX":
            axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL":
            autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH":
            str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1":
            self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME":
            asg_name,
            "DNS_SERVER_IP":
            os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES":
            str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS":
            minion_manager_asgs,
        }

    def _persist_node_resource_rsvp(self, user_node_daemon_cpu,
                                    user_node_daemon_mem):
        """
        Record the per-user-node daemon resource reservation and save the config.

        :param user_node_daemon_cpu: cpu reserved for daemons on a user node
        :param user_node_daemon_mem: memory reserved for daemons on a user node
        """
        cluster_config = self._cluster_config
        cluster_config.set_user_node_resource_rsvp(
            mem=user_node_daemon_mem, cpu=user_node_daemon_cpu)
        cluster_config.save_config()

    def start(self):
        """
        Bring up the platform following the steps of the loaded platform configuration.

        The first three steps are created before the external DNS is set up;
        all remaining steps are created afterwards. Once every step is
        created, application managers are re-created and version information
        is uploaded.

        :return:
        :raises AssertionError: if the configuration has fewer than 3 steps
        :raises AXPlatformException: propagated from create_objects() when a
                                     step fails
        """
        # Generate kube-objects
        steps = self._config.steps
        self._load_kube_objects_from_steps(steps)
        self._replacing = self._generate_replacing()

        # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32)
        # At this moment, we MUST separate the first steps due to the above
        # dependency. steps[0..2] are used unconditionally below, so fail
        # before creating anything when they are not all present.
        assert len(steps) >= 3, "Should have at least 3 steps to create axops"
        self.create_objects(steps[0])
        self.create_objects(steps[1])
        self.create_objects(steps[2])

        # Prepare axops_eip
        self._set_ext_dns()

        logger.debug("Replacing ENVs: %s", self._replacing)

        info_bound = "=======================================================\n"
        img_namespace = "Image Namespace: {}\n".format(
            self._software_info.image_namespace)
        img_version = "Image Version: {}\n".format(
            self._software_info.image_version)
        start_info = "\n\n{}{}{}{}{}".format(
            info_bound, "Platform Up: Bringing up Argo services...\n",
            img_namespace, img_version, info_bound)
        logger.info(start_info)

        # Start rest of the objects
        for object_group in steps[3:]:
            self.create_objects(object_group)

        # update application namespace
        logger.info("Updating application managers")
        for app in Applications(client=self.kubectl).list():
            logger.info("--- updating %s", app)
            Application(app, client=self.kubectl).create(force_recreate=True)
        logger.info("Done updating application managers")

        # Upload version information to target cluster
        self._update_version()
        logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n",
                    COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name,
                    COLOR_NORM)

    def stop(self):
        """
        Bring down the platform by deleting the configured steps in reverse order.

        :return:
        :raises AXPlatformException: propagated from delete_objects() when a
                                     step fails
        """
        # Generate kube-objects (Does not need to generate replacing during platform down)
        # Stop order should be the reverse of start. Work on a reversed copy so
        # the step list held by the configuration keeps its original order.
        steps = list(reversed(self._config.steps))
        self._load_kube_objects_from_steps(steps)

        info_bound = "=======================================================\n"
        stop_info = "\n\n{}{}{}".format(
            info_bound, "Platform Down: Shutting down Argo services...\n",
            info_bound)
        logger.info(stop_info)

        # Bring down objects according to steps
        for object_group in steps:
            self.delete_objects(object_group)

    def stop_monitor(self):
        """
        Stop the kube monitor that was started in __init__.
        """
        self._monitor.stop()

    def create_objects(self, objects):
        """
        Start kubernetes objects based on records.
        Wait for all of them.

        Every object of the group is created concurrently through start_one(),
        using one worker thread per object. A group that is skipped by
        _should_create_group(), or that has no objects, is a no-op.

        :param objects: AXPlatformObjectGroup
        :raises AXPlatformException: if creating any object of the group failed
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate,
                consistency=objects.consistency):
            logger.debug(
                "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                objects.name, objects.policy, objects.policy_predicate,
                objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        if not objects.object_set:
            # A thread pool cannot be built with zero workers; an empty group
            # simply has nothing to create.
            logger.info("Object group %s is empty, nothing to create",
                        objects.name)
            return
        logger.info("Creating platform objects\n\n%s",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            async_results[obj.name] = pool.apply_async(
                self.start_one,
                args=(obj.name, ),
                kwds={"namespace": obj.namespace})
        # No more work is submitted; wait for every creation to finish.
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)

        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_create_group(self, policy, policy_predicate, consistency):
        """
        Decide whether an AXPlatformObjectGroup should be created.

        Only the policy predicate is consulted at the moment: a group marked
        PrivateRegistryOnly is skipped when the registry is not private. The
        policy and consistency arguments are accepted to keep the interface
        in place but are not used.

        :param policy:
        :param policy_predicate:
        :param consistency:
        :return: True if the group should be created, False otherwise
        """
        needs_private_registry = (
            policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly)
        return bool(not needs_private_registry
                    or self._software_info.registry_is_private())

    def delete_objects(self, objects):
        """
        Stop kubernetes objects based on records.
        Wait for all of them.

        Every object of the group is deleted concurrently through stop_one(),
        using one worker thread per object. A group that is skipped by
        _should_delete_group(), or that has no objects, is a no-op.

        :param objects: AXPlatformObjectGroup
        :raises AXPlatformException: if deleting any object of the group failed
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_delete_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate):
            logger.debug(
                "Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                objects.name, objects.policy, objects.policy_predicate)
            return
        logger.info("Delete step: %s", objects.name)
        if not objects.object_set:
            # A thread pool cannot be built with zero workers; an empty group
            # simply has nothing to delete.
            logger.info("Object group %s is empty, nothing to delete",
                        objects.name)
            return
        logger.info("Deleting platform objects\n\n%s.",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            async_results[obj.name] = pool.apply_async(
                self.stop_one,
                args=(obj.name, ),
                kwds={"namespace": obj.namespace})
        # No more work is submitted; wait for every deletion to finish.
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Delete")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to delete platform objects.")

    def _should_delete_group(self, policy, policy_predicate):
        """
        Decide whether an AXPlatformObjectGroup should be deleted.

        Only groups whose policy is CreateMany are deleted; the policy
        predicate is accepted but not used, and consistency plays no role
        for deletion.

        :param policy:
        :param policy_predicate:
        :return: True if the group should be deleted, False otherwise
        """
        return bool(policy == ObjectGroupPolicy.CreateMany)

    def start_one(self, name, namespace=AXNameSpaces.AXSYS):
        """
        Create one platform object and wait until it is up.

        An already healthy object is left alone. An object that exists but is
        not healthy is deleted first and then re-created. Creation is tracked
        through kube monitor waiters when the object provides monitor info,
        otherwise by polling its health. Unless debug mode is on, an object
        that failed to come up is deleted again.

        :param name: name of the object, a key of self._kube_objects
        :param namespace: namespace the object is created in
        :return: dict with keys "name", "code" (list of status strings),
                 "events" (list), "failed" (bool) and "duration" (seconds
                 elapsed, as a string)
        """
        # Random delay so that objects started in parallel do not all hit the
        # API at the same moment.
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Creating %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        assert isinstance(kube_obj, KubeObject)

        # Update them as there are new updates in replacing in platform start
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }
        if kube_obj.healthy():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        # Previous platform start might fail, and might result in some components created
        # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing
        # object and try to create a new one
        if kube_obj.exists():
            logger.warning(
                "Object %s exists but not healthy. Deleting object for idempotency ...",
                name)
            self.stop_one(name, namespace)

        assert not kube_obj.exists(
        ), "Kubeobject {} already created but is not healthy. Not Expected".format(
            name)

        monitor_info = kube_obj.get_create_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            # BEFORE issuing the create call below.
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout,
                    waiter)

            # Call kubectl create
            kube_obj.create()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], waiter.result))
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    logger.info("Successfully created %s with code %s.",
                                wait_info["name"], waiter.result)
                else:
                    result["failed"] = True
                    logger.error(
                        "Failed to create %s in %s with code %s. Events: %s",
                        wait_info["name"], namespace, waiter.result,
                        str(waiter.details))
                    if not self._debug:
                        # Give up on the first failure; remaining waiters are
                        # not waited on.
                        return self._delete_after_failed_create(
                            name, namespace, result, start)

            # Poll extra if required (for Petset and Deployments with multiple replicas)
            if kube_obj.extra_poll:
                logger.info(
                    "Polling till healthy to make sure rest of components of %s are up and running ...",
                    name)
                create_rst = self._poll_till_healthy(
                    name=name,
                    kube_obj=kube_obj,
                    start_time=start,
                    poll_interval=AXPlatformConfigDefaults.
                    ObjCreateExtraPollInterval,
                    poll_max_retry=AXPlatformConfigDefaults.
                    ObjCreateExtraPollMaxRetry,
                    rst=result)
                if create_rst["failed"] and not self._debug:
                    return self._delete_after_failed_create(
                        name, namespace, create_rst, start)
                return create_rst

            # Poll once to confirm all components from this Kubernetes config file exist,
            # In case there are objects in this config file cannot be monitored, i.e. svc
            # without elb. This is really not expected so we don't delete it
            if not kube_obj.healthy():
                logger.error(
                    "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                    name)
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                result["failed"] = True
                result["events"].append(
                    "Object {} created but is not healthy".format(name))
            result["duration"] = str(round(time.time() - start, 2))

            if not result["failed"]:
                logger.info("Successfully created object %s.", name)
            return result
        else:
            # use polling
            kube_obj.create()
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                return self._delete_after_failed_create(
                    name, namespace, create_rst, start)
            return create_rst

    def _delete_after_failed_create(self, name, namespace, result, start):
        """
        Delete an object whose creation failed and merge the outcome into result.

        :param name: name of the object, a key of self._kube_objects
        :param namespace: namespace the object lives in
        :param result: result dict of the failed creation, updated in place
        :param start: time.time() value taken when the creation started
        :return: the updated result dict
        """
        logger.info("Deleting %s due to creation failure", name)
        del_rst = self.stop_one(name, namespace)
        result["code"] += del_rst["code"]
        result["events"] += del_rst["events"]
        result["duration"] = str(round(time.time() - start, 2))
        return result

    @staticmethod
    def _poll_till_healthy(name, kube_obj, start_time, poll_interval,
                           poll_max_retry, rst):
        """
        Poll a KubeObject until it is healthy or the retry budget is used up.

        :param name: object name, used in log lines and status codes
        :param kube_obj: KubeObject to poll
        :param start_time: time.time() value the reported duration is measured from
        :param poll_interval: seconds to sleep between two health checks
        :param poll_max_retry: number of unhealthy checks tolerated before giving up
        :param rst: result dict, updated in place ("code", "events", "failed",
                    "duration")
        :return: the same rst dict
        """
        assert isinstance(kube_obj, KubeObject)
        unhealthy_checks = 0
        while not kube_obj.healthy():
            unhealthy_checks += 1
            if unhealthy_checks > poll_max_retry:
                # Out of retries: report the object as unhealthy.
                logger.error("Failed to create KubeObject %s", name)
                rst["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                rst["events"].append(
                    "Object {} creation timeout. Not healthy".format(name))
                rst["failed"] = True
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

        logger.info("Successfully created %s.", name)
        rst["code"].append("{:.25s}:{}".format(name, KubeObjStatusCode.OK))
        rst["failed"] = False
        rst["duration"] = str(round(time.time() - start_time, 2))
        return rst

    def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
        """
        Delete one platform object and wait until it is gone.

        An object that does not exist is reported as deleted right away.
        Deletion is tracked through kube monitor waiters when the object
        provides monitor info, otherwise by polling for its existence.

        :param name: name of the object, a key of self._kube_objects
        :param namespace: namespace the object is deleted from
        :return: dict with keys "name", "code" (list of status strings),
                 "events" (list), "failed" (bool) and "duration" (seconds
                 elapsed, as a string)
        """
        # Random delay so that objects stopped in parallel do not all hit the
        # API at the same moment.
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Deleting %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing
        assert isinstance(kube_obj, KubeObject)

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }

        # Don't delete if object does not exist
        if not kube_obj.exists():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        monitor_info = kube_obj.get_delete_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            # BEFORE issuing the delete call below.
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout,
                    waiter)

            # Call kubectl delete
            kube_obj.delete()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.DELETED))
                    logger.info("Successfully deleted %s in %s with code %s.",
                                wait_info["name"], name, result["code"])
                else:
                    result["failed"] = True
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.UNKNOWN))
                    logger.error(
                        "Failed to delete %s in %s with code %s. Events: %s",
                        wait_info["name"], name, result["code"],
                        str(waiter.details))

            # Poll once to confirm all components from this Kubernetes config file are gone,
            # in case there are objects in this config file that cannot be monitored, i.e. svc without elb
            if kube_obj.exists():
                logger.error("Object %s deleted but still exists", name)
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                result["events"].append(
                    "Object {} deleted but still exists.".format(name))
            result["duration"] = str(round(time.time() - start, 2))
            # Only claim success when no waiter failed and nothing is left
            # behind.
            if not result["failed"]:
                logger.info("Successfully deleted %s.", name)
            return result
        else:
            # use polling
            kube_obj.delete()
            return self._poll_till_not_exists(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
                rst=result)

    @staticmethod
    def _poll_till_not_exists(name, kube_obj, start_time, poll_interval,
                              poll_max_retry, rst):
        """
        Poll a KubeObject until it no longer exists or the retry budget is used up.

        :param name: object name, used in log lines and status codes
        :param kube_obj: KubeObject to poll
        :param start_time: time.time() value the reported duration is measured from
        :param poll_interval: seconds to sleep between two existence checks
        :param poll_max_retry: number of "still exists" checks tolerated before
                               giving up
        :param rst: result dict, updated in place ("code", "events", "failed",
                    "duration")
        :return: the same rst dict
        """
        assert isinstance(kube_obj, KubeObject)
        still_there_checks = 0
        while kube_obj.exists():
            still_there_checks += 1
            if still_there_checks > poll_max_retry:
                # Out of retries: the object could not be confirmed deleted.
                logger.error("Failed to delete KubeObject %s", name)
                rst["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                rst["events"].append(
                    "Object {} deletion timeout. Please manually check remaining pods"
                    .format(name))
                rst["failed"] = True
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

        logger.info("Successfully deleted %s.", name)
        rst["code"].append("{:.25s}:{}".format(name,
                                               KubeObjStatusCode.DELETED))
        rst["failed"] = False
        rst["duration"] = str(round(time.time() - start_time, 2))
        return rst

    def _generate_object_summary(self, objects):
        """
        Render a table with the name, manifest path and namespace of each object.

        :param objects: list of AXPlatformObject
        :return: multi-line string: title row, separator bar, one row per object
        """
        pieces = [
            "\n{:25s} |  {:110s} |  {:20s}\n".format("NAME", "MANIFEST",
                                                     "NAMESPACE"),
            "{}\n".format("-" * 174),
        ]
        for platform_obj in objects:
            assert isinstance(platform_obj, AXPlatformObject)
            pieces.append("{:25s} |  {:110s} |  {:20s}\n".format(
                platform_obj.name,
                os.path.join(self._manifest_root, platform_obj.manifest),
                platform_obj.namespace))
        return "".join(pieces)

    @staticmethod
    def _generate_report(results, operation):
        """
        Collect the per-object results of a create / delete step into a report.

        :param results: dict mapping object name to an async result whose
                        get() returns a result dict with keys "code",
                        "events", "failed" and "duration"
        :param operation: operation label used in the report header, e.g.
                          "Create" or "Delete"
        :return: tuple (report string, failed); failed is True when any object
                 reported failure or retrieving its result raised
        """
        row_format = "{:25s} |  {:110s} |  {:20s}\n"
        warning_format = "{name}: {events}\n\n"

        failed = False
        report_body = ""
        warnings = "\n======= WARNING EVENTS =======\n"
        for name, async_result in results.items():
            try:
                # get() re-raises whatever the worker raised.
                result = async_result.get()
                if result["failed"]:
                    failed = True
                # join() also copes with an empty code list.
                code = " / ".join(str(c) for c in result["code"])
                row = row_format.format(name, code, result["duration"])
                if result["events"]:
                    warnings += warning_format.format(
                        name=name, events=str(result["events"]))
            except Exception as e:
                # A worker that raised counts as a failure but must not stop
                # the report for the remaining objects.
                failed = True
                logger.exception(str(e))
                row = row_format.format(name, "EXCEPTION", "UNKNOWN")
                warnings += warning_format.format(name=name, events=str(e))
            report_body += row

        report_head = "\n\nPlatform {} {}. Report:\n".format(
            operation, "FAILED" if failed else "SUCCESSFULLY")
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "STATUS", "TIME (sec)")
        report_bar = "{}\n".format("-" * 174)
        return "{}{}{}{}{}{}".format(
            report_head, report_title, report_bar, report_body, warnings,
            "==============================\n"), failed

    def _get_eip_from_config_map(self):
        """
        Read the cluster's external DNS name from the "cluster-dns-name"
        config map via kubectl.

        This is a best-effort lookup: any failure is logged and reported to
        the caller as None.

        :return: single-element list holding the DNS name, or None on failure
        """
        try:
            cmd = [
                "kubectl", "get", "configmap", "cluster-dns-name", "-o",
                "yaml", "--namespace", self.kube_axsys_namespace,
                "--kubeconfig", self._kube_config
            ]
            out = subprocess.check_output(cmd)
            # safe_load is used because kubectl emits plain YAML, and because
            # yaml.load() without an explicit Loader is rejected by recent
            # PyYAML releases, which would make this lookup fail every time.
            config_map = yaml.safe_load(out)
            return [config_map["data"]["cluster-external-dns-name"]]
        except Exception as e:
            logger.error(
                "Failed to get cluster dns name from config map: %s", e)
            return None

    def _get_svc_eip(self, svclabel, namespace):
        """
        Collect the external endpoints of the service matching a label.

        :param svclabel: label selector of the service, e.g. "app=axops"
        :param namespace: namespace to look the service up in
        :return: list of load balancer hostnames and/or IPs; empty if the
                 service has no load balancer ingress yet
        """
        svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace,
                                                  svclabel)
        assert len(
            svc.items) == 1, "Currently services should only have one ingress"
        # The ingress list is None until a load balancer has been provisioned;
        # treat that as "no endpoints" instead of failing on iteration.
        ingresses = svc.items[0].status.load_balancer.ingress or []
        rst = []
        for ig in ingresses:
            if ig.hostname:
                rst.append(ig.hostname)
            if ig.ip:
                rst.append(ig.ip)
        return rst

    def _set_ext_dns(self):
        """
        Resolve the external DNS name / IP of AXOPS and record it.

        The config map is consulted first; the "app=axops" service is only
        queried when the config map yields nothing. The first entry found is
        stored in ``self.cluster_dns_name`` and in the "AXOPS_EXT_DNS"
        replacement.

        :raises AXPlatformException: if no external endpoint can be found
        """
        endpoints = self._get_eip_from_config_map()
        if not endpoints:
            endpoints = self._get_svc_eip(svclabel="app=axops",
                                          namespace=AXNameSpaces.AXSYS)

        if not endpoints:
            logger.error(
                "Platform Start Failed: cannot find External IP for AXOPS")
            raise AXPlatformException("AXOPS elastic IP does not exist")

        self.cluster_dns_name = endpoints[0]
        # Don't change format of this message. Portal parses this line to get cluster IP/DNS.
        logger.info(
            "\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n",
            COLOR_GREEN, self.cluster_dns_name, COLOR_NORM)
        self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name

    def get_cluster_external_dns(self):
        """
        Return the cluster's external DNS name, resolving it through
        ``_set_ext_dns()`` when it has not been set yet.

        :return: value of ``self.cluster_dns_name``
        """
        if self.cluster_dns_name:
            return self.cluster_dns_name
        self._set_ext_dns()
        return self.cluster_dns_name

    def _set_autoscaling(self):
        """
        Find the cluster's user auto scaling group and store its name in the
        "ASG_NAME" replacement.

        The variable ASG is preferred, then the spot ASG, then the on-demand
        ASG.

        :raises AXPlatformException: if no ASG is found or it has no name
        """
        manager = AXUserASGManager(self._cluster_name_id, self._region,
                                   self._aws_profile)
        asg = (manager.get_variable_asg() or manager.get_spot_asg()
               or manager.get_on_demand_asg())
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))

        asg_name = asg["AutoScalingGroupName"]
        if asg_name is None:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        self._replacing["ASG_NAME"] = asg_name

    # TODO (#157) Version should only be uploaded during install and upgrade time
    def _update_version(self):
        """
        Persist the current software info, including the AMI id, through
        AXVersion.
        """
        # Software info obtained at install / upgrade time carries no AMI id,
        # so it is filled in from the cluster config before being persisted.
        self._software_info.ami_id = self._cluster_config.get_ami_id()

        version = AXVersion(AXCustomerId().get_customer_id(),
                            self._cluster_name_id, self._aws_profile)
        version.update(self._software_info.to_dict())
Example #2
0
class ClusterUpgrader(ClusterOperationBase):
    """
    Cluster operation that upgrades Kubernetes and/or the Argo platform
    services of an existing cluster to the software versions requested in a
    ClusterUpgradeConfig.
    """

    def __init__(self, cfg):
        """
        :param cfg: ClusterUpgradeConfig with the cluster identity, cloud
                    profile and target software info
        """
        assert isinstance(cfg, ClusterUpgradeConfig)
        self._cfg = cfg
        super(ClusterUpgrader, self).__init__(
            cluster_name=cfg.cluster_name,
            cluster_id=cfg.cluster_id,
            cloud_profile=cfg.cloud_profile)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()

        profile = cfg.cloud_profile
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=profile)
        self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id,
                                               aws_profile=profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=profile,
            region=self._cluster_config.get_region())

        # NOTE(review): yaml.load() without an explicit Loader is rejected by
        # recent PyYAML releases. Confirm the stored software info holds no
        # Python-specific tags before switching to yaml.safe_load().
        stored_info = self._cluster_info.download_cluster_software_info()
        self._current_software_info = SoftwareInfo(
            info_dict=yaml.load(stored_info))
        self._cidr = str(get_public_ip()) + "/32"

    def run(self):
        """
        Validate the request, work out which parts need upgrading and run the
        upgrade.

        Kubernetes is upgraded when the installer version or Kubernetes
        version differs. Platform services are upgraded when the image
        namespace/version differs, when the image version is "latest", or
        whenever Kubernetes is upgraded.

        :raises RuntimeError: if validation fails or any upgrade step fails
        """
        self._runtime_validation()

        target = self._cfg.target_software_info
        current = self._current_software_info

        same_kube = (
            target.kube_installer_version == current.kube_installer_version
            and target.kube_version == current.kube_version)
        upgrade_kube = not same_kube

        same_service = (
            target.image_namespace == current.image_namespace
            and target.image_version == current.image_version
            and target.image_version != "latest"
            and not upgrade_kube)
        upgrade_service = not same_service

        # NOTE(review): with --force-upgrade and no version change, both flags
        # stay False, so none of the upgrade steps below run. Confirm this is
        # the intended meaning of "force".
        if not (upgrade_service or upgrade_kube or self._cfg.force_upgrade):
            logger.info("%sCluster's software versions is not changed, not performing upgrade.%s", COLOR_GREEN, COLOR_NORM)
            logger.info("%sIf you want to force upgrade cluster, please specify --force-upgrade flag.%s", COLOR_YELLOW, COLOR_NORM)
            return

        if self._cfg.dry_run:
            logger.info("DRY RUN: upgrading cluster %s", self._name_id)
            return

        upgrade_info = "".join([
            "    Software Image: {}:{}  ->  {}:{}\n".format(
                current.image_namespace, current.image_version,
                target.image_namespace, target.image_version),
            "    Kubernetes: {}  ->  {}\n".format(
                current.kube_version, target.kube_version),
            "    Kubernetes Installer: {}  ->  {}".format(
                current.kube_installer_version,
                target.kube_installer_version),
        ])
        logger.info("\n\n%sUpgrading cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)

        # Main upgrade routine
        try:
            self._ensure_credentials()
            self._ensure_upgrader_access()
            ensure_manifest_temp_dir()

            if upgrade_service:
                self._shutdown_platform()

            if upgrade_kube:
                self._upgrade_kube()

            if upgrade_service:
                self._start_platform()
                self._cluster_info.upload_platform_manifests_and_config(
                    platform_manifest_root=self._cfg.manifest_root,
                    platform_config=self._cfg.bootstrap_config)
            logger.info("\n\n%sSuccessfully upgraded cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            # Temporary access is revoked whether or not the upgrade succeeded
            self._disallow_upgrader_access_if_needed()

    def _runtime_validation(self):
        """
        Collect every reason the upgrade cannot proceed and report them all
        at once.

        :raises RuntimeError: if at least one precondition is violated
        """
        target = self._cfg.target_software_info
        current = self._current_software_info
        problems = []

        # Abort operation if cluster is not successfully installed
        installed = check_cluster_staging(cluster_info_obj=self._cluster_info,
                                          stage="stage2")
        if not installed:
            problems.append("Cannot upgrade cluster that is not successfully installed: Stage2 information missing!")

        # A stored "status before pause" is treated as the cluster being paused
        if self._cluster_info.download_cluster_status_before_pause():
            problems.append("Upgrading a paused cluster is not currently supported. Please restart it first")

        # Abort operation if registry information changed
        registry_changed = (
            target.registry != current.registry
            or target.registry_secrets != current.registry_secrets)
        if registry_changed:
            problems.append("Changing registry information during upgrade is not supported currently!")

        # Abort operation if ami information changed; a target ami id is only
        # compared when one is provided
        ami_changed = (
            target.ami_name != current.ami_name
            or (target.ami_id and target.ami_id != current.ami_id))
        if ami_changed:
            problems.append("Upgrading AMI information is not currently supported.")

        if problems:
            raise RuntimeError("Upgrade aborted. Error(s): {}".format(problems))

    def _ensure_credentials(self):
        """
        Download the kube config and kube key through the cluster info object.
        """
        info = self._cluster_info
        info.download_kube_config()
        info.download_kube_key()

    def _shutdown_platform(self):
        """
        This step shuts down platform based on the config and manifest that
        are downloaded for the cluster into the temporary locations
        :return:
        """
        logger.info("Shutting Argo platform ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)
        running_platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            config_file=TEMP_PLATFORM_CONFIG_PATH)
        running_platform.stop()
        running_platform.stop_monitor()

    def _upgrade_kube(self):
        """
        This function calls the "upgrade-kubernetes" script to upgrade
        Kubernetes and cluster nodes
        :return:
        """
        target = self._cfg.target_software_info
        env = {
            "CLUSTER_NAME_ID": self._name_id,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "OLD_KUBE_VERSION": self._current_software_info.kube_version,
            "NEW_KUBE_VERSION": target.kube_version,
            "NEW_CLUSTER_INSTALL_VERSION": target.kube_installer_version,
            "ARGO_AWS_REGION": self._cluster_config.get_region()
        }
        env["ARGO_AWS_PROFILE"] = self._cfg.cloud_profile or AWS_DEFAULT_PROFILE

        logger.info("Upgrading Kubernetes with environments %s", pformat(env))
        # NOTE(review): the process environment is merged last, so a variable
        # already set in os.environ overrides the value computed above.
        # Confirm that precedence is intended.
        env.update(os.environ)
        subprocess.check_call(["upgrade-kubernetes"], env=env)

    def _start_platform(self):
        """
        This step brings up Argo platform services using the target manifests,
        bootstrap config and software info
        :return:
        """
        logger.info("Bringing up Argo platform ...")

        upgraded_platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=self._cfg.manifest_root,
            config_file=self._cfg.bootstrap_config,
            software_info=self._cfg.target_software_info)
        upgraded_platform.start()
        upgraded_platform.stop_monitor()

    def _ensure_upgrader_access(self):
        """
        Temporarily allow this machine's public IP on the node security
        groups when it is not among the cluster's trusted CIDRs.
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return
        logger.info("Upgrading cluster from a not trusted IP (%s). Temporarily allowing access.", self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[],
            new_cidr=[self._cidr],
            action_name="allow-cluster-manager")

    def _disallow_upgrader_access_if_needed(self):
        """
        Remove this machine's public IP from the node security groups when it
        is not among the cluster's trusted CIDRs.
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return
        logger.info("Upgrading cluster from a not trusted IP (%s). Disallowing access.", self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[self._cidr],
            new_cidr=[],
            action_name="disallow-cluster-manager")
Example #3
0
class ClusterUninstaller(ClusterOperationBase):
    """
    Cluster operation that tears down a cluster: the Kubernetes cluster's
    cloud resources first, then the Argo-specific cloud infrastructure.
    """

    def __init__(self, cfg):
        """
        :param cfg: ClusterUninstallConfig with the cluster identity, cloud
                    profile and uninstall options
        """
        assert isinstance(cfg, ClusterUninstallConfig)
        self._cfg = cfg
        super(ClusterUninstaller,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile,
                             dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)

        # Initialize node count to 1 as master is not in an auto scaling group
        self._total_nodes = 1
        self._cidr = str(get_public_ip()) + "/32"

    def pre_run(self):
        """
        Check that the cluster may be uninstalled and move it into the
        uninstall state.

        Unless '--force-uninstall' is given, the cluster must have stage2
        information and be in Running state.

        :raises RuntimeError: if the cluster is supported by portal or one of
                              the non-forced checks fails
        """
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )
        # Abort operation if cluster is not successfully installed
        if not check_cluster_staging(
                cluster_info_obj=self._cluster_info,
                stage="stage2") and not self._cfg.force_uninstall:
            raise RuntimeError(
                "Cluster is not successfully installed or has already been half deleted. If you really want to uninstall the cluster, please add '--force-uninstall' flag to finish uninstalling cluster. e.g. 'argocluster uninstall --force-uninstall --cluster-name xxx'"
            )
        if not self._csm.is_running() and not self._cfg.force_uninstall:
            raise RuntimeError(
                "Cluster is not in Running state. If you really want to uninstall the cluster, please add '--force-uninstall' flag to finish uninstalling cluster. e.g. 'argocluster uninstall --force-uninstall --cluster-name xxx'"
            )
        self._csm.do_uninstall()
        self._ensure_critical_information()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        """Nothing to do after uninstall."""
        return

    def run(self):
        """
        Uninstall the cluster, or only log the intent on a dry run.

        :raises RuntimeError: wrapping any error raised by an uninstall step
        """
        if self._cfg.dry_run:
            logger.info("DRY RUN: Uninstalling cluster %s", self._name_id)
            return

        logger.info("%s\n\nUninstalling cluster %s%s\n", COLOR_GREEN,
                    self._name_id, COLOR_NORM)

        # Main uninstall cluster routine
        try:
            self._check_cluster_before_uninstall()

            # We only need to keep stage0 information, which is an indication of we still need to
            # clean up the Kubernetes cluster
            self._cluster_info.delete_staging_info("stage2")
            self._cluster_info.delete_staging_info("stage1")
            self._clean_up_kubernetes_cluster()

            # As _clean_up_argo_specific_cloud_infrastructure() will clean everything inside bucket
            # that is related to this cluster, stage0 information is not explicitly deleted here
            self._clean_up_argo_specific_cloud_infrastructure()

            logger.info("\n\n%sSuccessfully uninstalled cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)

    def _ensure_critical_information(self):
        """
        If not force uninstall, we don't require user to provide a cloud regions / placement and therefore
        these 2 fields in self._cfg are None. We need to load them from cluster config.

        On force uninstall, user-provided region and placement are used when
        both are given; otherwise stage0 information must exist so they can be
        loaded from cluster config.

        :raises RuntimeError: on force uninstall without region/placement when
                              stage0 information is missing
        :return:
        """
        load_from_cluster_config = True
        if self._cfg.force_uninstall:
            if self._cfg.cloud_region and self._cfg.cloud_placement:
                load_from_cluster_config = False
            elif not check_cluster_staging(cluster_info_obj=self._cluster_info,
                                           stage="stage0"):
                # Fail uninstall when cluster_config does not exist and region/placement
                # information are not provided
                raise RuntimeError("""

        Cluster Stage 0 information is missing. Cluster is either not installed or it's management records in S3 are broken.
        If you believe there is still resource leftover, please provide cluster's region/placement information using
        "--cloud-placement" and "--cloud-region"
        
                    """)

        if load_from_cluster_config:
            self._cfg.cloud_region = self._cluster_config.get_region()
            self._cfg.cloud_placement = self._cluster_config.get_zone()

    def _clean_up_argo_specific_cloud_infrastructure(self):
        """
        This step cleans up components in cloud provider that are specifically needed by
        Argo cluster, including:
            - Buckets (everything under this cluster's directory)
            - Server certificates
        :return:
        """
        logger.info(
            "Cluster uninstall step: Clean Up Argo-specific Infrastructure")
        AXClusterBuckets(self._name_id, self._cfg.cloud_profile,
                         self._cfg.cloud_region).delete()

        # Delete server certificates: This code is deleting the default server certificates created
        # by public and private elb. Since server certs cannot be tagged, we need to delete them this way.
        certname = ManagedElb.get_elb_name(self._name_id, "ing-pub")
        delete_server_certificate(self._cfg.cloud_profile, certname)
        certname = ManagedElb.get_elb_name(self._name_id, "ing-pri")
        delete_server_certificate(self._cfg.cloud_profile, certname)

    def _clean_up_kubernetes_cluster(self):
        """
        This step cleans up Kubernetes if needed. It only touches components in cloud provider that
        Kubernetes needs, including:
            - Load Balancers
            - Instances
            - Auto scaling groups
            - launch configurations
            - Volumes
            - Security groups
            - Elastic IPs
            - VPCs (If this VPC is not shared)

        The step is skipped when stage0 information is missing, unless this
        is a force uninstall.
        :return:
        """
        if not check_cluster_staging(
                cluster_info_obj=self._cluster_info,
                stage="stage0") and not self._cfg.force_uninstall:
            logger.info("Skip clean up Kubernetes cluster")
            return

        logger.info("Cluster uninstall step: Clean Up Kubernetes Cluster")

        if self._cfg.force_uninstall:
            msg = "{}\n\nIt is possible that cluster S3 bucket is accidentally deleted,\n".format(
                COLOR_YELLOW)
            msg += "or S3 bucket information has been altered unintentionally. In this\n"
            msg += "case, we still try to delete cluster since this is force uninstall.\n"
            msg += "NOTE: cluster deletion might NOT be successful and still requires\n"
            msg += "user to clean up left-over resources manually.{}\n".format(
                COLOR_NORM)
            logger.warning(msg)

        env = {
            "KUBERNETES_PROVIDER": self._cfg.cloud_provider,
            "KUBE_AWS_ZONE": self._cfg.cloud_placement,
            "KUBE_AWS_INSTANCE_PREFIX": self._name_id
        }

        if self._cfg.cloud_profile:
            env["AWS_DEFAULT_PROFILE"] = self._cfg.cloud_profile

        logger.info("\n\n%sCalling kube-down ...%s\n", COLOR_GREEN, COLOR_NORM)
        AXKubeUpDown(cluster_name_id=self._name_id,
                     env=env,
                     aws_profile=self._cfg.cloud_profile).down()

        # TODO (#111): revise volume teardown in GCP
        if Cloud().target_cloud_aws():
            delete_tagged_ebs(aws_profile=self._cfg.cloud_profile,
                              tag_key=COMMON_CLOUD_RESOURCE_TAG_KEY,
                              tag_value=self._name_id,
                              region=self._cfg.cloud_region)

    def _check_cluster_before_uninstall(self):
        """
        This step does sanity check before uninstalling the cluster.
        On force uninstall the check is skipped and a warning is logged.
        :return:
        """
        if not self._cfg.force_uninstall:
            logger.info("Cluster uninstall step: Sanity Checking")
            self._cluster_info.download_kube_config()
            self._ensure_uninstaller_access()
            self._check_cluster_fixture(kube_config_path=self._cluster_info.
                                        get_kube_config_file_path())
        else:
            msg = "{}\n\nForce uninstall: Skip checking cluster. Note that uninstall might fail if there is\n".format(
                COLOR_YELLOW)
            msg += "still managed fixture hooked up with cluster. In case cluster uninstall failed due to AWS\n"
            msg += "resource dependency, please manually clean up those resources and retry uninstall.\n{}".format(
                COLOR_NORM)
            logger.warning(msg)

    @staticmethod
    def _check_cluster_fixture(kube_config_path):
        """
        This step checks if the cluster has any fixture hooked up.
            - If there are fixtures hooked up, we abort uninstall, as we don't know how to tear down managed
               fixtures when we clean up cloud resources
            - If we don't know whether there is fixture or not, we print out a warning for now and continue
        :param kube_config_path: path to kube_config
        :raises RuntimeError: if the fixture manager reports remaining fixtures
        :return:
        """
        with open(kube_config_path, "r") as f:
            config_data = f.read()
        # safe_load is used because a kubeconfig is plain YAML, and because
        # yaml.load() without an explicit Loader is rejected by recent PyYAML
        # releases.
        kube_config = yaml.safe_load(config_data)
        username = None
        password = None

        # All kubeconfig we generate has only 1 cluster
        server = kube_config["clusters"][0]["cluster"]["server"]

        # Use the first user entry that carries a username
        for user in kube_config.get("users", []):
            u = user["user"]
            if u.get("username", ""):
                username = u.get("username")
                password = u.get("password")
                break
        if not (username and password):
            logger.warning(
                "%sFailed to check managed fixture because Kubernetes credentials cannot be found to access cluster%s",
                COLOR_YELLOW, COLOR_NORM)
            return

        # NOTE(review): credentials are passed on the curl command line and
        # TLS verification is disabled (--insecure).
        cmd = [
            "curl", "--insecure", "--silent", "-u",
            "{}:{}".format(username, password), "--max-time", "15",
            "{server}/api/v1/proxy/namespaces/axsys/services/fixturemanager/v1/fixture/instances?deleted=false"
            .format(server=server)
        ]

        try:
            ret = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as cpe:
            msg = "{}\n\nFailed to check cluster fixture state due to {}. Cluster might\n".format(
                COLOR_YELLOW, cpe)
            msg += "not be healthy. We will proceed to uninstall cluster with best effort. Note if there are\n"
            msg += "fixtures that are not cleaned up, uninstall can fail. You can manually\n"
            msg += "clean them up and uninstall again.\n{}".format(COLOR_NORM)
            logger.warning(msg)
            return

        if ret:
            try:
                fixture = json.loads(ret).get("data", [])
                if fixture:
                    logger.error("Remaining fixtures:\n%s", fixture)
                    raise RuntimeError(
                        "Please cleanup all fixtures before doing uninstall. Or use '--force-uninstall' option to skip this check"
                    )
                else:
                    logger.info(
                        "Cluster has no fixture hooked up, proceed to uninstall."
                    )
            except ValueError as ve:
                # In case cluster is not healthy, command output will not be able to loaded
                # as json. Currently treat it same as "Cannot get fixture data" case
                logger.warning(
                    "Cannot parse fixture info: %s. Assume cluster has no fixture, proceed to uninstall. Fixture info: %s",
                    ve, ret)
        else:
            logger.warning(
                "Cannot get fixture data. Assume that cluster has no fixture hooked up, proceed to uninstall."
            )

    def _ensure_uninstaller_access(self):
        """
        Temporarily allow this machine's public IP on the node security
        groups when it is not among the cluster's trusted CIDRs, so the
        sanity check can reach the cluster.
        """
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Uninstalling cluster from a not trusted IP (%s). Temporarily allowing access.",
                self._cidr)
            bootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                    aws_profile=self._cfg.cloud_profile,
                                    region=self._cfg.cloud_region)
            bootstrap.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager")
Example #4
0
class ClusterResumer(ClusterOperationBase):
    def __init__(self, cfg):
        """
        :param cfg: ClusterRestartConfig with the cluster identity, cloud
                    profile and dry-run flag
        """
        assert isinstance(cfg, ClusterRestartConfig)
        self._cfg = cfg
        super(ClusterResumer, self).__init__(
            cluster_name=cfg.cluster_name,
            cluster_id=cfg.cluster_id,
            cloud_profile=cfg.cloud_profile,
            dry_run=cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()

        profile = cfg.cloud_profile
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=profile)
        self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id,
                                               aws_profile=profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=profile,
            region=self._cluster_config.get_region())

        # Initialize node count to 1 as master is not in an auto scaling group
        self._total_nodes = 1
        self._cidr = str(get_public_ip()) + "/32"

        # NOTE(review): yaml.load() without an explicit Loader is rejected by
        # recent PyYAML releases. Confirm the stored software info holds no
        # Python-specific tags before switching to yaml.safe_load().
        stored_info = self._cluster_info.download_cluster_software_info()
        self._software_info = SoftwareInfo(info_dict=yaml.load(stored_info))

    def pre_run(self):
        """
        Check that the cluster may be resumed and move it into the resume
        state.

        Exits the process with status 0 when the cluster is already running.

        :raises RuntimeError: if the cluster is supported by portal or stage2
                              information is missing
        """
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )

        if self._csm.is_running():
            logger.info("Cluster is already running.")
            sys.exit(0)

        installed = check_cluster_staging(cluster_info_obj=self._cluster_info,
                                          stage="stage2")
        if not installed:
            raise RuntimeError(
                "Cluster is not successfully installed: Stage2 information missing! Operation aborted."
            )

        self._csm.do_resume()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        """
        Call ``done_resume()`` on ``self._csm`` and then
        ``_persist_cluster_state_if_needed()``.
        """
        self._csm.done_resume()
        self._persist_cluster_state_if_needed()

    def run(self):
        """
        Resume the cluster, or only log the intent on a dry run.

        The steps run strictly in order: restart master, recover auto scaling
        groups, wait for master, grant this machine access, wait for minions,
        prepare the manifest temp dir and start the platform. Temporary
        access is revoked afterwards whether or not the steps succeeded.

        :raises RuntimeError: wrapping any error raised by a resume step
        """
        if self._cfg.dry_run:
            logger.info("DRY RUN: Resuming cluster %s with software info %s",
                        self._name_id, self._software_info.to_dict())
            return

        logger.info("%s\n\nResuming cluster %s%s\n", COLOR_GREEN,
                    self._name_id, COLOR_NORM)
        # Main resume cluster routine
        try:
            resume_steps = (
                self._master_manager.restart_master,
                self._recover_auto_scaling_groups,
                self._wait_for_master,
                self._ensure_restarter_access,
                self._wait_for_minions,
                ensure_manifest_temp_dir,
                self._start_platform,
            )
            for step in resume_steps:
                step()
            logger.info("\n\n%sSuccessfully resumed cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            self._disallow_restarter_access_if_needed()

    def _start_platform(self):
        """
        This step brings up Argo platform services from the manifests and
        config downloaded for the cluster into the temporary locations
        :return:
        """
        logger.info("Bringing up Argo platform ...")

        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)

        platform_kwargs = dict(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            config_file=TEMP_PLATFORM_CONFIG_PATH,
            software_info=self._software_info)
        resumed_platform = AXPlatform(**platform_kwargs)
        resumed_platform.start()
        resumed_platform.stop_monitor()

    def _wait_for_master(self):
        """
        This step waits for master to be up and running.

        The master is looked up at most WAIT_FOR_RUNNING_MASTER_RETRY times,
        with a 5 second pause after every unsuccessful lookup.

        :raises RuntimeError: if no running master shows up within the retries
        :return:
        """
        attempt = 0
        running_master = None
        while attempt < WAIT_FOR_RUNNING_MASTER_RETRY:
            logger.info(
                "Waiting for master to be up and running. Trail %s / %s",
                attempt, WAIT_FOR_RUNNING_MASTER_RETRY)
            running_master = self._master_manager.discover_master(
                state=[EC2InstanceState.Running])
            if running_master:
                logger.info("%sMaster %s is running%s", COLOR_GREEN,
                            running_master, COLOR_NORM)
                return
            time.sleep(5)
            attempt += 1

        if attempt == WAIT_FOR_RUNNING_MASTER_RETRY:
            raise RuntimeError(
                "Timeout waiting for master {} to come up. Please manually check cluster status"
                .format(running_master))

    def _wait_for_minions(self):
        """
        This step waits for all minions to come up and registered in Kubernetes master.

        After a fixed 120 second wait, nodes are listed every 10 seconds until
        at least ``self._total_nodes`` nodes are registered and every node
        reports all expected Kubelet condition reasons.

        :raises RuntimeError: if more than WAIT_FOR_MINION_REG_RETRY counted
                              trials pass without all nodes being ready
        :return:
        """
        # Get kubernetes access token
        self._cluster_info.download_kube_config()
        kube_config = self._cluster_info.get_kube_config_file_path()

        # Wait for nodes to be ready.
        # Because we made sure during pause that kubernetes master already knows that all minions are gone,
        # we don't need to worry about cached minions here
        logger.info("Wait 120 seconds before Kubernetes master comes up ...")
        time.sleep(120)
        kubectl = KubernetesApiClient(config_file=kube_config)
        logger.info("Waiting for all Kubelets to be ready ...")

        trail = 0
        while True:
            try:
                all_kubelets_ready = True
                nodes = kubectl.api.list_node()
                logger.info("%s / %s nodes registered", len(nodes.items),
                            self._total_nodes)
                if len(nodes.items) < self._total_nodes:
                    all_kubelets_ready = False
                else:
                    for n in nodes.items:
                        # Condition reasons a node must report to be ready;
                        # each reason seen on the node is crossed off below.
                        kubelet_check = {
                            "KubeletHasSufficientDisk",
                            "KubeletHasSufficientMemory",
                            "KubeletHasNoDiskPressure", "KubeletReady",
                            "RouteCreated"
                        }
                        for cond in n.status.conditions:
                            if cond.reason in kubelet_check:
                                kubelet_check.remove(cond.reason)
                        if kubelet_check:
                            # Anything left over is still missing; stop at the
                            # first node that is not ready.
                            logger.info(
                                "Node %s not ready yet. Remaining Kubelet checkmarks: %s",
                                n.metadata.name, kubelet_check)
                            all_kubelets_ready = False
                            break
                        else:
                            logger.info("Node %s is ready.", n.metadata.name)
                if all_kubelets_ready:
                    logger.info("All Kubelets are ready")
                    break
            except Exception as e:
                if "Max retries exceeded" in str(e):
                    # If master API server is still not ready at this moment, we don't count as a trail
                    # (the decrement here cancels the increment after this block)
                    trail -= 1
                    logger.info("Kubernetes API server not ready yet")
                else:
                    # Any other error is logged and still counts as a trial
                    logger.exception("Caught exception when listing nodes: %s",
                                     e)
            trail += 1
            if trail > WAIT_FOR_MINION_REG_RETRY:
                raise RuntimeError(
                    "Timeout waiting for minions to come up. Please manually check cluster status"
                )
            time.sleep(10)

    def _recover_auto_scaling_groups(self):
        """
        Bring auto scaling groups back to the sizes recorded before pause.

        - When a saved cluster status is found, every recorded auto scaling group is set
          back to its min / max / desired sizes, the method waits for the groups to reach
          that state, and the saved status is deleted afterwards.
        - When no saved status is found (e.g. a previous restart already got past this
          stage, or the cluster was never paused), the groups are left untouched.

        In both cases self._total_nodes is increased by the desired capacity of each group.
        :return:
        """
        logger.info("Fetching last cluster status ...")
        saved_status_raw = self._cluster_info.download_cluster_status_before_pause(
        )

        asg_manager = AXUserASGManager(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())

        if not saved_status_raw:
            # Nothing persisted: just count what the groups currently ask for
            for group in asg_manager.get_all_asgs():
                self._total_nodes += group["DesiredCapacity"]

            logger.info(
                "Cannot find last cluster status, cluster already resumed with %s nodes",
                self._total_nodes)
            return

        logger.info("Found last cluster status, restoring cluster ...")
        # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
        # objects from tagged input. Consider yaml.safe_load once it is confirmed the
        # persisted status only ever contains plain YAML tags.
        saved_status = yaml.load(saved_status_raw)

        # Restore minions
        for group_name, spec in saved_status["asg_status"].items():
            lower = spec["min_size"]
            upper = spec["max_size"]
            wanted = spec["desired_capacity"]
            self._total_nodes += wanted
            logger.info(
                "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                group_name, lower, upper, wanted)
            asg_manager.set_asg_spec(name=group_name,
                                     minsize=lower,
                                     maxsize=upper,
                                     desired=wanted)

        logger.info("Waiting for all auto scaling groups to scale up ...")
        asg_manager.wait_for_desired_asg_state()
        logger.info("%sAll cluster instances are in service%s",
                    COLOR_GREEN, COLOR_NORM)

        # The saved status has served its purpose; remove it
        self._cluster_info.delete_cluster_status_before_pause()

    def _ensure_restarter_access(self):
        """
        Add self._cidr to the node security groups when it is not among the
        cluster's trusted CIDRs; do nothing otherwise.
        :return:
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return

        logger.info(
            "Restarting cluster from a not trusted IP (%s). Temporarily allowing access.",
            self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[],
            new_cidr=[self._cidr],
            action_name="allow-cluster-manager")

    def _disallow_restarter_access_if_needed(self):
        """
        Remove self._cidr from the node security groups when it is not among the
        cluster's trusted CIDRs; do nothing otherwise.
        :return:
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return

        logger.info(
            "Restarting cluster from a not trusted IP (%s). Disallowing access.",
            self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[self._cidr],
            new_cidr=[],
            action_name="disallow-cluster-manager")
Example #5
0
class ClusterPauser(ClusterOperationBase):
    """
    Cluster operation that pauses a cluster: the platform is stopped, all auto scaling
    groups are scaled down to zero, and the master is stopped last.
    """

    def __init__(self, cfg):
        """
        :param cfg: ClusterPauseConfig instance; its cluster_name, cluster_id,
                    cloud_profile and dry_run attributes are read here
        """
        assert isinstance(cfg, ClusterPauseConfig)
        self._cfg = cfg
        super(ClusterPauser, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile,
            dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()

        profile = self._cfg.cloud_profile
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=profile)
        self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id,
                                               aws_profile=profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=profile,
            region=self._cluster_config.get_region())

        # Single-host CIDR built from this machine's public IP
        public_ip = str(get_public_ip())
        self._cidr = public_ip + "/32"

    def pre_run(self):
        """
        Validate that the cluster may be paused, then mark the pause as started.

        Exits the process with status 0 when the cluster is already paused.
        :raises RuntimeError: if the cluster is supported by portal, or stage2 information is missing
        """
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )
        if self._csm.is_paused():
            logger.info("Cluster is already paused.")
            sys.exit(0)

        # This is for backward compatibility
        stage2_present = check_cluster_staging(
            cluster_info_obj=self._cluster_info, stage="stage2")
        if not stage2_present:
            raise RuntimeError(
                "Cluster is not successfully installed: Stage2 information missing! Operation aborted."
            )
        self._csm.do_pause()
        self._persist_cluster_state_if_needed()

    def run(self):
        """
        Pause the cluster, unless this is a dry run or the master is already stopped.

        :raises RuntimeError: wrapping any exception raised by the pause routine
        """
        if self._cfg.dry_run:
            logger.info("DRY RUN: pausing cluster %s", self._name_id)
            return

        # Stopping the master is the very last step of a pause, so a master that is
        # already stopped means the cluster has been paused successfully before
        stopped_master = self._master_manager.discover_master(
            state=[EC2InstanceState.Stopped])
        if stopped_master:
            logger.info(
                "\n\n%sMaster %s already stopped. Cluster %s already paused%s\n",
                COLOR_GREEN, stopped_master, self._name_id, COLOR_NORM)
            return

        logger.info("\n\n%sPausing cluster %s%s\n", COLOR_GREEN,
                    self._name_id, COLOR_NORM)

        # Main pause cluster routine
        try:
            self._ensure_pauser_access()
            ensure_manifest_temp_dir()
            self._shutdown_platform()
            self._scale_down_auto_scaling_groups()
            self._wait_for_deregistering_minions()
            logger.info("Stopping master ...")
            self._master_manager.stop_master()
            logger.info("\n\n%sSuccessfully paused cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as err:
            logger.exception(err)
            raise RuntimeError(err)
        finally:
            # Always undo the temporary access grant, whether the pause succeeded or not
            self._disallow_pauser_access_if_needed()

    def post_run(self):
        """
        Mark the pause as done and persist the cluster state if needed.
        """
        self._csm.done_pause()
        self._persist_cluster_state_if_needed()

    def _wait_for_deregistering_minions(self):
        """
        Poll the Kubernetes master every 15 seconds until its node list holds at most
        one entry, e.g. `kubectl get nodes` returns no minions besides master
        :return:
        """
        logger.info(
            "Waiting for Kubernetes master to de-register all existing minions"
        )
        self._cluster_info.download_kube_config()
        config_path = self._cluster_info.get_kube_config_file_path()
        api_client = KubernetesApiClient(config_file=config_path)
        while True:
            try:
                remaining = api_client.api.list_node().items

                # Only the master should be left in the listing by now; whether the
                # single remaining node really is the master is not verified
                if len(remaining) <= 1:
                    logger.info("%sAll minions de-registered from master%s",
                                COLOR_GREEN, COLOR_NORM)
                    break

                logger.info("Remaining Kubernetes minions: %s",
                            [node.metadata.name for node in remaining])
            except Exception as err:
                logger.warning("Caught exception when listing nodes: %s", err)
            time.sleep(15)

    def _scale_down_auto_scaling_groups(self):
        """
        Record and then shrink every auto scaling group:
            - Upload the current min / max / desired sizes of all groups as the
              "cluster status before pause",
            - Set min and max size of every group to zero,
            - Wait for the groups to reach that state
        :return:
        """
        logger.info("Discovering autoscaling groups")
        asg_manager = AXUserASGManager(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())
        groups = asg_manager.get_all_asgs()

        # Snapshot of group sizes, used to recover the same amount of nodes
        # when the cluster is restarted
        snapshot = {
            "asg_status": {
                group["AutoScalingGroupName"]: {
                    "min_size": group["MinSize"],
                    "max_size": group["MaxSize"],
                    "desired_capacity": group["DesiredCapacity"]
                }
                for group in groups
            }
        }
        self._cluster_info.upload_cluster_status_before_pause(
            status=yaml.dump(snapshot))

        # Scale down asg
        logger.info("Scaling down autoscaling groups ...")
        for group in groups:
            asg_manager.set_asg_spec(name=group["AutoScalingGroupName"],
                                     minsize=0,
                                     maxsize=0)

        # Waiting for nodes to be terminated
        logger.info("Waiting for all auto scaling groups to scale down ...")
        asg_manager.wait_for_desired_asg_state()
        logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN,
                    COLOR_NORM)

    def _shutdown_platform(self):
        """
        Download the platform manifests and config into the temp locations, then stop
        the platform and its monitor.
        :return:
        """
        logger.info("Shutting platform for pausing the cluster ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)
        ax_platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            config_file=TEMP_PLATFORM_CONFIG_PATH)
        ax_platform.stop()
        ax_platform.stop_monitor()

    def _ensure_pauser_access(self):
        """
        Add this host's CIDR to the node security groups when it is not among the
        cluster's trusted CIDRs; do nothing otherwise.
        :return:
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return

        logger.info(
            "Pausing cluster from a not trusted IP (%s). Temporarily allowing access.",
            self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[],
            new_cidr=[self._cidr],
            action_name="allow-cluster-manager")

    def _disallow_pauser_access_if_needed(self):
        """
        Remove this host's CIDR from the node security groups when it is not among the
        cluster's trusted CIDRs; do nothing otherwise.
        :return:
        """
        if self._cidr in self._cluster_config.get_trusted_cidr():
            return

        logger.info(
            "Pausing cluster from a not trusted IP (%s). Disallowing access.",
            self._cidr)
        self._bootstrap_obj.modify_node_security_groups(
            old_cidr=[self._cidr],
            new_cidr=[],
            action_name="disallow-cluster-manager")