Example #1
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterUpgradeConfig)
        self._cfg = cfg
        super(ClusterUpgrader, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile
        )

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region()
        )
        self._current_software_info = SoftwareInfo(
            info_dict=yaml.load(
                self._cluster_info.download_cluster_software_info()
            )
        )
        self._cidr = str(get_public_ip()) + "/32"
Example #2
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterRestartConfig)
        self._cfg = cfg
        super(ClusterResumer,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile,
                             dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=self._cfg.cloud_profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())

        # Initialize node count to 1 as master is not in an auto scaling group
        self._total_nodes = 1
        self._cidr = str(get_public_ip()) + "/32"
        self._software_info = SoftwareInfo(info_dict=yaml.load(
            self._cluster_info.download_cluster_software_info()))
Example #3
    def _ensure_uninstaller_access(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Uninstalling cluster from an untrusted IP (%s). Temporarily allowing access.",
                self._cidr)
            bootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                    aws_profile=self._cfg.cloud_profile,
                                    region=self._cfg.cloud_region)
            bootstrap.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager")
Example #4
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterPauseConfig)
        self._cfg = cfg
        super(ClusterPauser,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=self._cfg.cloud_profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())
        self._cidr = str(get_public_ip()) + "/32"
Example #5
class ClusterUpgrader(ClusterOperationBase):
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterUpgradeConfig)
        self._cfg = cfg
        super(ClusterUpgrader, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile
        )

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region()
        )
        self._current_software_info = SoftwareInfo(
            info_dict=yaml.load(
                self._cluster_info.download_cluster_software_info()
            )
        )
        self._cidr = str(get_public_ip()) + "/32"

    def run(self):
        self._runtime_validation()

        upgrade_kube = True
        upgrade_service = True

        if self._cfg.target_software_info.kube_installer_version == self._current_software_info.kube_installer_version \
            and self._cfg.target_software_info.kube_version == self._current_software_info.kube_version:
            upgrade_kube = False

        if self._cfg.target_software_info.image_namespace == self._current_software_info.image_namespace \
            and self._cfg.target_software_info.image_version == self._current_software_info.image_version \
            and self._cfg.target_software_info.image_version != "latest" \
            and not upgrade_kube:
            upgrade_service = False

        if not upgrade_service and not upgrade_kube and not self._cfg.force_upgrade:
            logger.info("%sCluster's software versions is not changed, not performing upgrade.%s", COLOR_GREEN, COLOR_NORM)
            logger.info("%sIf you want to force upgrade cluster, please specify --force-upgrade flag.%s", COLOR_YELLOW, COLOR_NORM)
            return


        if self._cfg.dry_run:
            logger.info("DRY RUN: upgrading cluster %s", self._name_id)
            return

        upgrade_info = "    Software Image: {}:{}  ->  {}:{}\n".format(
            self._current_software_info.image_namespace, self._current_software_info.image_version,
            self._cfg.target_software_info.image_namespace, self._cfg.target_software_info.image_version
        )
        upgrade_info += "    Kubernetes: {}  ->  {}\n".format(
            self._current_software_info.kube_version, self._cfg.target_software_info.kube_version
        )
        upgrade_info += "    Kubernetes Installer: {}  ->  {}".format(
            self._current_software_info.kube_installer_version, self._cfg.target_software_info.kube_installer_version
        )
        logger.info("\n\n%sUpgrading cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)

        # Main pause cluster routine
        try:
            self._ensure_credentials()

            self._ensure_upgrader_access()

            ensure_manifest_temp_dir()

            if upgrade_service:
                self._shutdown_platform()

            if upgrade_kube:
                self._upgrade_kube()

            if upgrade_service:
                self._start_platform()
                self._cluster_info.upload_platform_manifests_and_config(
                    platform_manifest_root=self._cfg.manifest_root,
                    platform_config=self._cfg.bootstrap_config
                )
            logger.info("\n\n%sSuccessfully upgraded cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            self._disallow_upgrader_access_if_needed()

    def _runtime_validation(self):
        all_errs = []
        # Abort operation if cluster is not successfully installed
        if not check_cluster_staging(cluster_info_obj=self._cluster_info, stage="stage2"):
            all_errs.append("Cannot upgrade cluster that is not successfully installed: Stage2 information missing!")

        cluster_status_raw = self._cluster_info.download_cluster_status_before_pause()
        if cluster_status_raw:
            all_errs.append("Upgrading a paused cluster is not currently supported. Please restart it first")

        # Abort operation if registry information changed
        if self._cfg.target_software_info.registry != self._current_software_info.registry \
            or self._cfg.target_software_info.registry_secrets != self._current_software_info.registry_secrets:
            all_errs.append("Changing registry information during upgrade is not supported currently!")

        # Abort operation if ami information changed
        if self._cfg.target_software_info.ami_name != self._current_software_info.ami_name \
            or (self._cfg.target_software_info.ami_id and self._cfg.target_software_info.ami_id != self._current_software_info.ami_id):
            all_errs.append("Upgrading AMI information is not currently supported.")

        if all_errs:
            raise RuntimeError("Upgrade aborted. Error(s): {}".format(all_errs))

    def _ensure_credentials(self):
        self._cluster_info.download_kube_config()
        self._cluster_info.download_kube_key()

    def _shutdown_platform(self):
        """
        This step shuts down platform based on the config and manifest provided
        :return:
        """
        logger.info("Shutting Argo platform ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH
        )
        platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            config_file=TEMP_PLATFORM_CONFIG_PATH
        )
        platform.stop()
        platform.stop_monitor()

    def _upgrade_kube(self):
        """
        This function calls our script to upgrade Kubernetes and cluster nodes
        :return:
        """
        env = {
            "CLUSTER_NAME_ID": self._name_id,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "OLD_KUBE_VERSION": self._current_software_info.kube_version,
            "NEW_KUBE_VERSION": self._cfg.target_software_info.kube_version,
            "NEW_CLUSTER_INSTALL_VERSION": self._cfg.target_software_info.kube_installer_version,
            "ARGO_AWS_REGION": self._cluster_config.get_region()
        }
        
        if self._cfg.cloud_profile:
            env["ARGO_AWS_PROFILE"] = self._cfg.cloud_profile
        else:
            env["ARGO_AWS_PROFILE"] = AWS_DEFAULT_PROFILE

        logger.info("Upgrading Kubernetes with environments %s", pformat(env))
        env.update(os.environ)
        subprocess.check_call(["upgrade-kubernetes"], env=env)


    def _start_platform(self):
        """
        This step brings up Argo platform services
        :return:
        """
        logger.info("Bringing up Argo platform ...")

        platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=self._cfg.manifest_root,
            config_file=self._cfg.bootstrap_config,
            software_info=self._cfg.target_software_info
        )
        platform.start()
        platform.stop_monitor()

    def _ensure_upgrader_access(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info("Upgrading cluster from a not trusted IP (%s). Temporarily allowing access.", self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager"
            )

    def _disallow_upgrader_access_if_needed(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info("Upgrading cluster from a not trusted IP (%s). Disallowing access.", self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[self._cidr],
                new_cidr=[],
                action_name="disallow-cluster-manager"
            )
Example #6
    def _ensure_argo_microservices(self):
        """
        This step won't run if "--dry-run" is specified.

        This step assumes there is a running Kubernetes cluster. It does the following:
            - ensure ASG count
            - ensure trusted CIDRs
            - install Argo software onto the cluster and make sure it is up and running (we don't monitor
              whether a microservice is in a crash loop)
            - remove the manager CIDR if it is not part of the user-specified trusted CIDRs
            - upload stage2 information to S3

        Stage2 is an indication that the cluster has been successfully installed: Kubernetes is up and running, and
        all Argo software is up and running. It does not guarantee that none of the Argo services are in a crash loop.
        This step is idempotent.
        :return: cluster_dns_name, username, password
        """
        logger.info("Cluster installation step: Ensure Argo Micro-services")

        # Reload config in case stage0 and stage1 are skipped
        self._cluster_config.reload_config()

        trusted_cidrs = self._cluster_config.get_trusted_cidr()

        # Instantiate AXBootstrap object. There are a bunch of stand-alone tasks we need to
        # perform using that object.
        axbootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile,
                                  region=self._cluster_config.get_region())

        # We allow access from everywhere during installation phase, but will remove this access
        # if user does not specify 0.0.0.0/0 as their trusted CIDR
        axbootstrap.modify_node_security_groups(old_cidr=[],
                                                new_cidr=trusted_cidrs +
                                                [EC2IPPermission.AllIP],
                                                action_name="allow-creator")

        if check_cluster_staging(self._cluster_info, "stage2"):
            # TODO: some duplicated logic here, might need to combine them.
            logger.info(
                "Skipping 'ensure Argo micro-services' since the cluster has already been successfully installed"
            )
            platform = AXPlatform(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile)
            if EC2IPPermission.AllIP not in trusted_cidrs:
                axbootstrap.modify_node_security_groups(
                    old_cidr=[EC2IPPermission.AllIP],
                    new_cidr=[],
                    action_name="disallow-creator")
            return platform.get_cluster_external_dns(), "", ""

        # Modify ASG
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = int(
            self._cluster_config.get_min_node_count()) - axsys_node_count
        axuser_max_count = int(
            self._cluster_config.get_max_node_count()) - axsys_node_count
        axbootstrap.modify_asg(min=axuser_min_count, max=axuser_max_count)

        cluster_dns, username, password = self.install_and_run_platform()

        self.post_install()

        # Remove access from 0.0.0.0/0 if this is not what user specifies
        if EC2IPPermission.AllIP not in trusted_cidrs:
            axbootstrap.modify_node_security_groups(
                old_cidr=[EC2IPPermission.AllIP],
                new_cidr=[],
                action_name="disallow-creator")

        return cluster_dns, username, password
Example #7
class ClusterResumer(ClusterOperationBase):
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterRestartConfig)
        self._cfg = cfg
        super(ClusterResumer,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile,
                             dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=self._cfg.cloud_profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())

        # Initialize node count to 1 as master is not in an auto scaling group
        self._total_nodes = 1
        self._cidr = str(get_public_ip()) + "/32"
        self._software_info = SoftwareInfo(info_dict=yaml.load(
            self._cluster_info.download_cluster_software_info()))

    def pre_run(self):
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )

        if self._csm.is_running():
            logger.info("Cluster is already running.")
            sys.exit(0)
        if not check_cluster_staging(cluster_info_obj=self._cluster_info,
                                     stage="stage2"):
            raise RuntimeError(
                "Cluster is not successfully installed: Stage2 information missing! Operation aborted."
            )
        self._csm.do_resume()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        self._csm.done_resume()
        self._persist_cluster_state_if_needed()

    def run(self):
        if self._cfg.dry_run:
            logger.info("DRY RUN: Resuming cluster %s with software info %s",
                        self._name_id, self._software_info.to_dict())
            return

        logger.info("%s\n\nResuming cluster %s%s\n", COLOR_GREEN,
                    self._name_id, COLOR_NORM)
        # Main resume cluster routine
        try:
            self._master_manager.restart_master()
            self._recover_auto_scaling_groups()
            self._wait_for_master()
            self._ensure_restarter_access()
            self._wait_for_minions()
            ensure_manifest_temp_dir()
            self._start_platform()
            logger.info("\n\n%sSuccessfully resumed cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            self._disallow_restarter_access_if_needed()

    def _start_platform(self):
        """
        This step brings up Argo platform services
        :return:
        """
        logger.info("Bringing up Argo platform ...")

        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)

        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
                              config_file=TEMP_PLATFORM_CONFIG_PATH,
                              software_info=self._software_info)
        platform.start()
        platform.stop_monitor()

    def _wait_for_master(self):
        """
        This step waits for master to be up and running
        :return:
        """
        count = 0
        running_master = None
        while count < WAIT_FOR_RUNNING_MASTER_RETRY:
            logger.info(
                "Waiting for master to be up and running. Trail %s / %s",
                count, WAIT_FOR_RUNNING_MASTER_RETRY)
            running_master = self._master_manager.discover_master(
                state=[EC2InstanceState.Running])
            if not running_master:
                time.sleep(5)
            else:
                logger.info("%sMaster %s is running%s", COLOR_GREEN,
                            running_master, COLOR_NORM)
                break
            count += 1
        if count == WAIT_FOR_RUNNING_MASTER_RETRY:
            raise RuntimeError(
                "Timeout waiting for master {} to come up. Please manually check cluster status"
                .format(running_master))

    def _wait_for_minions(self):
        """
        This step waits for all minions to come up and registered in Kubernetes master
        :return:
        """
        # Get kubernetes access token
        self._cluster_info.download_kube_config()
        kube_config = self._cluster_info.get_kube_config_file_path()

        # Wait for nodes to be ready.
        # Because we made sure during pause that kubernetes master already knows that all minions are gone,
        # we don't need to worry about cached minions here
        logger.info("Wait 120 seconds before Kubernetes master comes up ...")
        time.sleep(120)
        kubectl = KubernetesApiClient(config_file=kube_config)
        logger.info("Waiting for all Kubelets to be ready ...")

        trail = 0
        while True:
            try:
                all_kubelets_ready = True
                nodes = kubectl.api.list_node()
                logger.info("%s / %s nodes registered", len(nodes.items),
                            self._total_nodes)
                if len(nodes.items) < self._total_nodes:
                    all_kubelets_ready = False
                else:
                    for n in nodes.items:
                        kubelet_check = {
                            "KubeletHasSufficientDisk",
                            "KubeletHasSufficientMemory",
                            "KubeletHasNoDiskPressure", "KubeletReady",
                            "RouteCreated"
                        }
                        for cond in n.status.conditions:
                            if cond.reason in kubelet_check:
                                kubelet_check.remove(cond.reason)
                        if kubelet_check:
                            logger.info(
                                "Node %s not ready yet. Remaining Kubelet checkmarks: %s",
                                n.metadata.name, kubelet_check)
                            all_kubelets_ready = False
                            break
                        else:
                            logger.info("Node %s is ready.", n.metadata.name)
                if all_kubelets_ready:
                    logger.info("All Kubelets are ready")
                    break
            except Exception as e:
                if "Max retries exceeded" in str(e):
                    # If the master API server is still not ready at this moment, don't count this as a trial
                    trail -= 1
                    logger.info("Kubernetes API server not ready yet")
                else:
                    logger.exception("Caught exception when listing nodes: %s",
                                     e)
            trail += 1
            if trail > WAIT_FOR_MINION_REG_RETRY:
                raise RuntimeError(
                    "Timeout waiting for minions to come up. Please manually check cluster status"
                )
            time.sleep(10)

    def _recover_auto_scaling_groups(self):
        """
        This step does the following:
            - fetch the previously persisted auto scaling group config. If this config cannot be found,
              we can assume that all autoscaling groups have correct configurations. This could happen
              when a previous restart failed in the middle but had already passed this stage, or the
              cluster was never paused
            - Wait for all instances to be in service
        :return:
        """
        # Get previously persisted asg status
        logger.info("Fetching last cluster status ...")
        cluster_status_raw = self._cluster_info.download_cluster_status_before_pause(
        )

        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())

        if cluster_status_raw:
            logger.info("Found last cluster status, restoring cluster ...")
            cluster_status = yaml.load(cluster_status_raw)
            all_asg_statuses = cluster_status["asg_status"]

            # Restore minions
            for asg_name in all_asg_statuses.keys():
                asg_status = all_asg_statuses[asg_name]
                min_size = asg_status["min_size"]
                max_size = asg_status["max_size"]
                desired = asg_status["desired_capacity"]
                self._total_nodes += desired
                logger.info(
                    "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                    asg_name, min_size, max_size, desired)
                asg_mgr.set_asg_spec(name=asg_name,
                                     minsize=min_size,
                                     maxsize=max_size,
                                     desired=desired)

            logger.info("Waiting for all auto scaling groups to scale up ...")
            asg_mgr.wait_for_desired_asg_state()
            logger.info("%sAll cluster instances are in service%s",
                        COLOR_GREEN, COLOR_NORM)

            # Delete previously stored cluster status
            self._cluster_info.delete_cluster_status_before_pause()
        else:
            all_asgs = asg_mgr.get_all_asgs()
            for asg in all_asgs:
                self._total_nodes += asg["DesiredCapacity"]

            logger.info(
                "Cannot find last cluster status, cluster already resumed with %s nodes",
                self._total_nodes)

    def _ensure_restarter_access(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Restarting cluster from a not trusted IP (%s). Temporarily allowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager")

    def _disallow_restarter_access_if_needed(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Restarting cluster from a not trusted IP (%s). Disallowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[self._cidr],
                new_cidr=[],
                action_name="disallow-cluster-manager")
Example #8
    def _ensure_argo_microservices(self):
        """
        This step won't run if "--dry-run" is specified.

        This step assumes there is a running Kubernetes cluster. It does the following:
            - ensure ASG count
            - ensure trusted CIDRs
            - install Argo software onto the cluster and make sure it is up and running (we don't monitor
              whether a microservice is in a crash loop)
            - remove the manager CIDR if it is not part of the user-specified trusted CIDRs
            - upload stage2 information to S3

        Stage2 is an indication that the cluster has been successfully installed: Kubernetes is up and running, and
        all Argo software is up and running. It does not guarantee that none of the Argo services are in a crash loop.
        This step is idempotent.
        :return: cluster_dns_name, username, password
        """
        logger.info("Cluster installation step: Ensure Argo Micro-services")

        # Reload config in case stage0 and stage1 are skipped
        self._cluster_config.reload_config()

        trusted_cidrs = self._cluster_config.get_trusted_cidr()

        # Instantiate AXBootstrap object. There are a bunch of stand-alone tasks we need to
        # perform using that object.
        axbootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile,
                                  region=self._cluster_config.get_region())

        # We allow access from everywhere during installation phase, but will remove this access
        # if user does not specify 0.0.0.0/0 as their trusted CIDR
        axbootstrap.modify_node_security_groups(old_cidr=[],
                                                new_cidr=trusted_cidrs +
                                                [EC2IPPermission.AllIP],
                                                action_name="allow-creator")

        if check_cluster_staging(self._cluster_info, "stage2"):
            # TODO: some duplicated logic here, might need to combine them.
            logger.info(
                "Skipping 'ensure Argo micro-services' since the cluster has already been successfully installed"
            )
            platform = AXPlatform(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile)
            if EC2IPPermission.AllIP not in trusted_cidrs:
                axbootstrap.modify_node_security_groups(
                    old_cidr=[EC2IPPermission.AllIP],
                    new_cidr=[],
                    action_name="disallow-creator")
            return platform.get_cluster_external_dns(), "", ""

        # Modify ASG
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = int(
            self._cluster_config.get_min_node_count()) - axsys_node_count
        axuser_max_count = int(
            self._cluster_config.get_max_node_count()) - axsys_node_count
        axbootstrap.modify_asg(min=axuser_min_count, max=axuser_max_count)

        # Install Argo micro-services
        # Platform install
        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=self._cfg.manifest_root,
                              config_file=self._cfg.bootstrap_config)

        install_platform_failed = False
        install_platform_failure_message = ""
        try:
            platform.start()
            platform.stop_monitor()
        except Exception as e:
            logger.exception(e)
            install_platform_failed = True
            install_platform_failure_message = str(
                e
            ) + "\nPlease manually check the cluster status and retry installation with same command if the error is transient."

        if install_platform_failed:
            raise RuntimeError(install_platform_failure_message)

        # In case platform is successfully installed,
        # connect to axops to get initial username and password
        username, password = self._get_initial_cluster_credentials()

        # Remove access from 0.0.0.0/0 if this is not what user specifies
        if EC2IPPermission.AllIP not in trusted_cidrs:
            axbootstrap.modify_node_security_groups(
                old_cidr=[EC2IPPermission.AllIP],
                new_cidr=[],
                action_name="disallow-creator")

        # Persist manifests to S3
        self._cluster_info.upload_platform_manifests_and_config(
            platform_manifest_root=self._cfg.manifest_root,
            platform_config=self._cfg.bootstrap_config)

        # Finally persist stage2 information
        self._cluster_info.upload_staging_info(stage="stage2", msg="stage2")
        logger.info(
            "Cluster installation step: Ensure Argo Micro-services successfully finished"
        )
        return platform.cluster_dns_name, username, password
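The stage2 marker uploaded at the end of this step is exactly what the pause, resume, and upgrade examples above check before they run. A minimal sketch of that round-trip, assuming an AXClusterInfo instance and the check_cluster_staging helper used throughout these examples:

    # Persist the stage2 marker once installation has finished (as in Example #8)
    cluster_info.upload_staging_info(stage="stage2", msg="stage2")

    # Later operations (pause/resume/upgrade) refuse to proceed without it
    if not check_cluster_staging(cluster_info_obj=cluster_info, stage="stage2"):
        raise RuntimeError("Cluster is not successfully installed: Stage2 information missing!")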
Example #9
File: rest.py Project: nuaays/argo
def update_cluster_sg_aws():
    """
    Ensure the Argo cluster is open to the given trusted_cidrs
    Data input format:
    {
        "trusted_cidrs": ["1.1.1.1/32", "2.2.2.2/32", "3.3.3.3/32"]
    }
    :return:
    """
    data = request.get_json()
    ip_ranges = data.get("trusted_cidrs", None)
    if not ip_ranges:
        return jsonify(trusted_cidrs=ip_ranges)

    if not isinstance(ip_ranges, list):
        raise AXIllegalArgumentException("Trusted CIDRs must be a list")

    @retry_unless(status_code=[404, 422])
    def _do_update_axops(ip):
        spec = {
            "spec": {
                "loadBalancerSourceRanges": ip
            }
        }
        kubectl.api.patch_namespaced_service(spec, name="axops", namespace="axsys")

    with cfg_lock:
        cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)
        current_trusted_cidrs = cluster_config.get_trusted_cidr()

        # Update node security groups
        cloud_util = AXBootstrap(cluster_name_id=cluster_name_id, region=cluster_config.get_region())
        try:
            cloud_util.modify_node_security_groups(
                old_cidr=current_trusted_cidrs,
                new_cidr=ip_ranges,
                action_name="UserInitiatedTrustedCidrChange"
            )
        except AWSClientError as ace:
            # In case of a client error, revert to the current CIDRs.
            # The only possible inconsistency is that CIDRs the user wants to add
            # which did not trigger a client error remain added to the node
            # security groups; this is fine as long as we return a proper error
            # message to the UI and leave the user to fix and retry.
            # Not catching exceptions here because any CIDR ranges persisted to
            # the cluster config are guaranteed to be acceptable to the cloud
            # provider

            # TODO (harry): not efficient here as it potentially checks CIDRs that are not removed
            cloud_util.modify_node_security_groups(
                old_cidr=[],
                new_cidr=current_trusted_cidrs,
                action_name="EnsureExistingDueToError"
            )

            if "InvalidParameterValue" in str(ace):
                raise AXIllegalArgumentException("InvalidParameterValue: {}".format(str(ace)))
            else:
                raise ace

        # Update axops security groups
        _do_update_axops(ip=ip_ranges)

        # Persist cluster config. We need to do this last: if any of the previous
        # operations fails, we should not show the updated trusted CIDRs on the UI
        # in any subsequent GET call
        cluster_config.set_trusted_cidr(cidrs=ip_ranges)
        cluster_config.save_config()

    return jsonify(trusted_cidrs=ip_ranges)
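The handler above expects the JSON payload described in its docstring and echoes the trusted CIDRs back on success. A minimal client-side sketch, assuming a hypothetical endpoint URL and HTTP method, since the route registered for update_cluster_sg_aws is not shown in this example:

    import requests

    # Hypothetical URL; substitute the actual route that maps to update_cluster_sg_aws
    resp = requests.put(
        "http://axmon.example.com/v1/axmon/cluster/security_groups",
        json={"trusted_cidrs": ["1.1.1.1/32", "2.2.2.2/32", "3.3.3.3/32"]},
    )
    resp.raise_for_status()
    print(resp.json())  # {"trusted_cidrs": ["1.1.1.1/32", "2.2.2.2/32", "3.3.3.3/32"]}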
Example #10
class ClusterPauser(ClusterOperationBase):
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterPauseConfig)
        self._cfg = cfg
        super(ClusterPauser,
              self).__init__(cluster_name=self._cfg.cluster_name,
                             cluster_id=self._cfg.cluster_id,
                             cloud_profile=self._cfg.cloud_profile,
                             dry_run=self._cfg.dry_run)

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id,
                                           aws_profile=self._cfg.cloud_profile)
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._master_manager = AXMasterManager(
            cluster_name_id=self._name_id,
            region=self._cluster_config.get_region(),
            profile=self._cfg.cloud_profile)
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region())
        self._cidr = str(get_public_ip()) + "/32"

    def pre_run(self):
        if self._cluster_info.is_cluster_supported_by_portal():
            raise RuntimeError(
                "Cluster is currently supported by portal. Please login to portal to perform cluster management operations."
            )
        if self._csm.is_paused():
            logger.info("Cluster is already paused.")
            sys.exit(0)

        # This is for backward compatibility
        if not check_cluster_staging(cluster_info_obj=self._cluster_info,
                                     stage="stage2"):
            raise RuntimeError(
                "Cluster is not successfully installed: Stage2 information missing! Operation aborted."
            )
        self._csm.do_pause()
        self._persist_cluster_state_if_needed()

    def run(self):
        if self._cfg.dry_run:
            logger.info("DRY RUN: pausing cluster %s", self._name_id)
            return

        # Check if the cluster's master is already stopped. Since stopping the master is the very last step
        # of pausing a cluster, if the master is already stopped, the cluster has already been successfully paused
        stopped_master = self._master_manager.discover_master(
            state=[EC2InstanceState.Stopped])
        if stopped_master:
            logger.info(
                "\n\n%sMaster %s already stopped. Cluster %s already paused%s\n",
                COLOR_GREEN, stopped_master, self._name_id, COLOR_NORM)
            return
        else:
            logger.info("\n\n%sPausing cluster %s%s\n", COLOR_GREEN,
                        self._name_id, COLOR_NORM)

        # Main pause cluster routine
        try:
            self._ensure_pauser_access()
            ensure_manifest_temp_dir()
            self._shutdown_platform()
            self._scale_down_auto_scaling_groups()
            self._wait_for_deregistering_minions()
            logger.info("Stopping master ...")
            self._master_manager.stop_master()
            logger.info("\n\n%sSuccessfully paused cluster %s%s\n",
                        COLOR_GREEN, self._name_id, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            self._disallow_pauser_access_if_needed()

    def post_run(self):
        self._csm.done_pause()
        self._persist_cluster_state_if_needed()

    def _wait_for_deregistering_minions(self):
        """
        This step waits for all minions to be de-registered from Kubernetes master,
        e.g. `kubectl get nodes` returns no minions besides master
        :return:
        """
        # Wait for kubernetes master de-register all minions
        logger.info(
            "Waiting for Kubernetes master to de-register all existing minions"
        )
        self._cluster_info.download_kube_config()
        kube_config = self._cluster_info.get_kube_config_file_path()
        kubectl = KubernetesApiClient(config_file=kube_config)
        while True:
            try:
                nodes = kubectl.api.list_node()
                node_names = []

                # list nodes should only show master now
                if len(nodes.items) > 1:
                    for n in nodes.items:
                        node_names.append(n.metadata.name)
                    logger.info("Remaining Kubernetes minions: %s", node_names)
                else:
                    # I don't see it necessary to check if the remaining node is master or not
                    logger.info("%sAll minions de-registered from master%s",
                                COLOR_GREEN, COLOR_NORM)
                    break
            except Exception as e:
                logger.warning("Caught exception when listing nodes: %s", e)
            time.sleep(15)

    def _scale_down_auto_scaling_groups(self):
        """
        This step:
            - Persist autoscaling group states to S3,
            - Scale down all autoscaling groups to zero,
            - Wait for all minions to be terminated
        :return:
        """
        logger.info("Discovering autoscaling groups")
        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())
        all_asgs = asg_mgr.get_all_asgs()

        # Generate cluster status before pause. This is used to recover the same number of nodes
        # when we restart the cluster
        cluster_status = {"asg_status": {}}
        for asg in all_asgs:
            cluster_status["asg_status"][asg["AutoScalingGroupName"]] = {
                "min_size": asg["MinSize"],
                "max_size": asg["MaxSize"],
                "desired_capacity": asg["DesiredCapacity"]
            }
        self._cluster_info.upload_cluster_status_before_pause(
            status=yaml.dump(cluster_status))

        # Scale down asg
        logger.info("Scaling down autoscaling groups ...")
        for asg in all_asgs:
            asg_name = asg["AutoScalingGroupName"]
            asg_mgr.set_asg_spec(name=asg_name, minsize=0, maxsize=0)

        # Waiting for nodes to be terminated
        logger.info("Waiting for all auto scaling groups to scale down ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN,
                    COLOR_NORM)

    def _shutdown_platform(self):
        """
        This step shuts down platform based on the config and manifest provided
        :return:
        """
        logger.info("Shutting platform for pausing the cluster ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH)
        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
                              config_file=TEMP_PLATFORM_CONFIG_PATH)
        platform.stop()
        platform.stop_monitor()

    def _ensure_pauser_access(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Pausing cluster from a not trusted IP (%s). Temporarily allowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager")

    def _disallow_pauser_access_if_needed(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info(
                "Pausing cluster from a not trusted IP (%s). Disallowing access.",
                self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[self._cidr],
                new_cidr=[],
                action_name="disallow-cluster-manager")