def __init__(self, cfg):
    assert isinstance(cfg, ClusterInstallConfig)
    self._cfg = cfg
    super(ClusterInstaller, self).__init__(
        cluster_name=self._cfg.cluster_name,
        cluster_id=self._cfg.cluster_id,
        cloud_profile=self._cfg.cloud_profile
    )

    # This might make the constructor heavy, but we need name_id to initialize critical objects
    # such as AXClusterConfig and AXClusterInfo. It works technically if we initialize them
    # to None and instantiate them when needed, but that makes programming error prone since
    # Python is not typed. The reason is that if you initialize these objects in other functions,
    # the IDE won't know their type and thus there will be no error checking about how you use them.
    try:
        self._name_id = self._idobj.get_cluster_name_id()
    except Exception as e:
        logger.info("Cannot find cluster name id: %s. Cluster is not yet created.", e)
        self._name_id = self._idobj.create_cluster_name_id()

    # Ensure cluster buckets before instantiating any class that uses cluster buckets.
    # Note that the AXClusterId object is an exception, as we need to create the cluster name_id
    # first, then instantiate buckets, and finally upload the cluster name id.
    # TODO (#116) bucket initialization should not depend on cluster name id
    AXClusterBuckets(
        name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        aws_region=self._cfg.cloud_region
    ).update()

    self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
    self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
def cluster(self, args):
    from ax.platform.ax_cluster_info import AXClusterInfo
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    Cloud().set_target_cloud(args.target_cloud)
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"

    if args.subcommand in ['start', 'create']:
        logger.error("=" * 80)
        logger.error("axtool cluster start/create has been moved to axinstaller")
        logger.error("=" * 80)
        sys.exit(1)
    elif args.subcommand in ['stop', 'delete']:
        logger.error("=" * 80)
        logger.error("axtool cluster stop/delete has been moved to axinstaller")
        logger.error("=" * 80)
        sys.exit(1)
    elif args.subcommand == 'show':
        import subprocess
        name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
        AXClusterInfo(name_id, aws_profile=args.aws_profile).download_kube_key()
        conf_file = AXClusterInfo(name_id, aws_profile=args.aws_profile).download_kube_config()
        logger.info("Kubeconfig")
        with open(conf_file, "r") as f:
            conf = f.read()
        logger.info("%s", conf)
        subprocess.call(["kubectl", "--kubeconfig", conf_file, "cluster-info"])
        subprocess.call(["kubectl", "--kubeconfig", conf_file, "get", "no"])
        subprocess.call(["kubectl", "--kubeconfig", conf_file, "--namespace", "axsys", "get", "po"])
    elif args.subcommand == 'download-config':
        name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
        if Cloud().target_cloud_aws():
            AXClusterInfo(name_id, aws_profile=args.aws_profile).download_kube_key()
        AXClusterInfo(name_id, aws_profile=args.aws_profile).download_kube_config()
def __init__(self, cfg):
    assert isinstance(cfg, ClusterUninstallConfig)
    self._cfg = cfg
    super(ClusterUninstaller, self).__init__(
        cluster_name=self._cfg.cluster_name,
        cluster_id=self._cfg.cluster_id,
        cloud_profile=self._cfg.cloud_profile
    )

    # This will raise exception if name/id mapping cannot be found
    self._name_id = self._idobj.get_cluster_name_id()
    self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
    self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)

    # Initialize node count to 1 as master is not in an auto scaling group
    self._total_nodes = 1
    self._cidr = str(get_public_ip()) + "/32"
def __init__(self, input_name, cloud_profile):
    """
    :param input_name: cluster name or <cluster_name>-<cluster_id> format
    :param cloud_profile:
    """
    name_id = AXClusterId(name=input_name, aws_profile=cloud_profile).get_cluster_name_id()
    self.cluster_config = AXClusterConfig(cluster_name_id=name_id, aws_profile=cloud_profile)
    self.cluster_info = AXClusterInfo(cluster_name_id=name_id, aws_profile=cloud_profile)
class AXKubeUpDown(object):
    """
    AX cluster bootstrap class.
    """
    def __init__(self, cluster_name_id, env=None, aws_profile=None):
        """
        :param cluster_name_id: String for cluster name_id, e.g. lcj-cluster-515d9828-7515-11e6-9b3e-a0999b1b4e15
        :param env: all environment variables for kube-up and kube-down.
        :param aws_profile: AWS profile used to access AWS account.
        """
        self._name_id = cluster_name_id
        self._aws_profile = aws_profile
        self._cluster_info = AXClusterInfo(cluster_name_id=cluster_name_id, aws_profile=aws_profile)
        self._kube_conf = self._cluster_info.get_kube_config_file_path()
        root = os.getenv("AX_KUBERNETES_ROOT")
        assert root, "Must set AX_KUBERNETES_ROOT to kubernetes directory"
        assert os.path.isdir(root), "AX_KUBERNETES_ROOT must be directory"
        self._kube = KubeUpDown(root, env)

    def up(self):
        """
        Bring up cluster and save kube_config in portal.
        """
        try:
            self._kube.up()
        finally:
            # Kube-up creates ssh key first. Try to save ssh key first.
            # We try to save keys/config (if generated) even if kube_up fails
            self._cluster_info.upload_kube_key()
            self._cluster_info.upload_kube_config()
        logger.info("New cluster id is %s", self._name_id)

    def down(self):
        """
        Get kube config from portal and shutdown cluster based on this config.
        """
        self._kube.down()
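# Usage sketch for AXKubeUpDown (not part of the original module). The cluster name_id below is a
# made-up placeholder; env is whatever kube-up/kube-down variables the caller has prepared, here
# simply inherited from the process environment. AX_KUBERNETES_ROOT must point at a kubernetes tree.
import os

bootstrap = AXKubeUpDown(
    cluster_name_id="demo-cluster-515d9828-7515-11e6-9b3e-a0999b1b4e15",  # placeholder
    env=dict(os.environ),
    aws_profile=None  # use default AWS credentials
)
bootstrap.up()    # runs kube-up; ssh key and kubeconfig are uploaded even if kube-up fails
# ... later ...
bootstrap.down()  # tears the cluster down using the same kubernetes root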
def __init__(self, cfg):
    assert isinstance(cfg, ClusterPauseConfig)
    self._cfg = cfg
    super(ClusterPauser, self).__init__(
        cluster_name=self._cfg.cluster_name,
        cluster_id=self._cfg.cluster_id,
        cloud_profile=self._cfg.cloud_profile
    )

    # This will raise exception if name/id mapping cannot be found
    self._name_id = self._idobj.get_cluster_name_id()
    self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
    self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
    self._master_manager = AXMasterManager(
        cluster_name_id=self._name_id,
        region=self._cluster_config.get_region(),
        profile=self._cfg.cloud_profile
    )
    self._bootstrap_obj = AXBootstrap(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region()
    )
    self._cidr = str(get_public_ip()) + "/32"
class ClusterUpgrader(ClusterOperationBase):
    def __init__(self, cfg):
        assert isinstance(cfg, ClusterUpgradeConfig)
        self._cfg = cfg
        super(ClusterUpgrader, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile
        )

        # This will raise exception if name/id mapping cannot be found
        self._name_id = self._idobj.get_cluster_name_id()
        self._cluster_info = AXClusterInfo(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile
        )
        self._bootstrap_obj = AXBootstrap(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            region=self._cluster_config.get_region()
        )
        self._current_software_info = SoftwareInfo(
            info_dict=yaml.load(
                self._cluster_info.download_cluster_software_info()
            )
        )
        self._cidr = str(get_public_ip()) + "/32"

    def run(self):
        self._runtime_validation()

        upgrade_kube = True
        upgrade_service = True
        if self._cfg.target_software_info.kube_installer_version == self._current_software_info.kube_installer_version \
                and self._cfg.target_software_info.kube_version == self._current_software_info.kube_version:
            upgrade_kube = False
        if self._cfg.target_software_info.image_namespace == self._current_software_info.image_namespace \
                and self._cfg.target_software_info.image_version == self._current_software_info.image_version \
                and self._cfg.target_software_info.image_version != "latest" \
                and not upgrade_kube:
            upgrade_service = False

        if not upgrade_service and not upgrade_kube and not self._cfg.force_upgrade:
            logger.info("%sCluster's software versions have not changed, not performing upgrade.%s", COLOR_GREEN, COLOR_NORM)
            logger.info("%sIf you want to force upgrade the cluster, please specify the --force-upgrade flag.%s", COLOR_YELLOW, COLOR_NORM)
            return

        if self._cfg.dry_run:
            logger.info("DRY RUN: upgrading cluster %s", self._name_id)
            return

        upgrade_info = " Software Image: {}:{} -> {}:{}\n".format(
            self._current_software_info.image_namespace,
            self._current_software_info.image_version,
            self._cfg.target_software_info.image_namespace,
            self._cfg.target_software_info.image_version
        )
        upgrade_info += " Kubernetes: {} -> {}\n".format(
            self._current_software_info.kube_version,
            self._cfg.target_software_info.kube_version
        )
        upgrade_info += " Kubernetes Installer: {} -> {}".format(
            self._current_software_info.kube_installer_version,
            self._cfg.target_software_info.kube_installer_version
        )
        logger.info("\n\n%sUpgrading cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)

        # Main upgrade cluster routine
        try:
            self._ensure_credentials()
            self._ensure_upgrader_access()
            ensure_manifest_temp_dir()
            if upgrade_service:
                self._shutdown_platform()
            if upgrade_kube:
                self._upgrade_kube()
            if upgrade_service:
                self._start_platform()
            self._cluster_info.upload_platform_manifests_and_config(
                platform_manifest_root=self._cfg.manifest_root,
                platform_config=self._cfg.bootstrap_config
            )
            logger.info("\n\n%sSuccessfully upgraded cluster %s:\n\n%s%s\n", COLOR_GREEN, self._name_id, upgrade_info, COLOR_NORM)
        except Exception as e:
            logger.exception(e)
            raise RuntimeError(e)
        finally:
            self._disallow_upgrader_access_if_needed()

    def _runtime_validation(self):
        all_errs = []

        # Abort operation if cluster is not successfully installed
        if not check_cluster_staging(cluster_info_obj=self._cluster_info, stage="stage2"):
            all_errs.append("Cannot upgrade cluster that is not successfully installed: Stage2 information missing!")

        # Abort operation if cluster is currently paused
        cluster_status_raw = self._cluster_info.download_cluster_status_before_pause()
        if cluster_status_raw:
            all_errs.append("Upgrading a paused cluster is not currently supported. Please restart it first")

        # Abort operation if registry information changed
        if self._cfg.target_software_info.registry != self._current_software_info.registry \
                or self._cfg.target_software_info.registry_secrets != self._current_software_info.registry_secrets:
            all_errs.append("Changing registry information during upgrade is not supported currently!")

        # Abort operation if ami information changed
        if self._cfg.target_software_info.ami_name != self._current_software_info.ami_name \
                or (self._cfg.target_software_info.ami_id
                    and self._cfg.target_software_info.ami_id != self._current_software_info.ami_id):
            all_errs.append("Upgrading AMI information is not currently supported.")

        if all_errs:
            raise RuntimeError("Upgrade aborted. Error(s): {}".format(all_errs))

    def _ensure_credentials(self):
        self._cluster_info.download_kube_config()
        self._cluster_info.download_kube_key()

    def _shutdown_platform(self):
        """
        This step shuts down the platform based on the config and manifest provided
        :return:
        """
        logger.info("Shutting down Argo platform ...")
        self._cluster_info.download_platform_manifests_and_config(
            target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH
        )
        platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
            config_file=TEMP_PLATFORM_CONFIG_PATH
        )
        platform.stop()
        platform.stop_monitor()

    def _upgrade_kube(self):
        """
        This function calls our script to upgrade Kubernetes and cluster nodes
        :return:
        """
        env = {
            "CLUSTER_NAME_ID": self._name_id,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "OLD_KUBE_VERSION": self._current_software_info.kube_version,
            "NEW_KUBE_VERSION": self._cfg.target_software_info.kube_version,
            "NEW_CLUSTER_INSTALL_VERSION": self._cfg.target_software_info.kube_installer_version,
            "ARGO_AWS_REGION": self._cluster_config.get_region()
        }
        if self._cfg.cloud_profile:
            env["ARGO_AWS_PROFILE"] = self._cfg.cloud_profile
        else:
            env["ARGO_AWS_PROFILE"] = AWS_DEFAULT_PROFILE
        logger.info("Upgrading Kubernetes with environments %s", pformat(env))
        env.update(os.environ)
        subprocess.check_call(["upgrade-kubernetes"], env=env)

    def _start_platform(self):
        """
        This step brings up Argo platform services
        :return:
        """
        logger.info("Bringing up Argo platform ...")
        platform = AXPlatform(
            cluster_name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            manifest_root=self._cfg.manifest_root,
            config_file=self._cfg.bootstrap_config,
            software_info=self._cfg.target_software_info
        )
        platform.start()
        platform.stop_monitor()

    def _ensure_upgrader_access(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info("Upgrading cluster from a non-trusted IP (%s). Temporarily allowing access.", self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[],
                new_cidr=[self._cidr],
                action_name="allow-cluster-manager"
            )

    def _disallow_upgrader_access_if_needed(self):
        if self._cidr not in self._cluster_config.get_trusted_cidr():
            logger.info("Upgrading cluster from a non-trusted IP (%s). Disallowing access.", self._cidr)
            self._bootstrap_obj.modify_node_security_groups(
                old_cidr=[self._cidr],
                new_cidr=[],
                action_name="disallow-cluster-manager"
            )
class ClusterStateMachine(object):
    def __init__(self, cluster_name_id, cloud_profile):
        self._cluster_info = AXClusterInfo(cluster_name_id=cluster_name_id, aws_profile=cloud_profile)
        current_state = self._cluster_info.download_cluster_current_state() or ClusterState.UNKNOWN
        self.machine = Machine(model=self, states=ClusterState.VALID_CLUSTER_STATES, initial=current_state)
        self._add_transitions()

    def _add_transitions(self):
        self.machine.add_transition(
            trigger="do_install",
            source=[ClusterState.UNKNOWN, ClusterState.INSTALLING],
            dest=ClusterState.INSTALLING
        )
        self.machine.add_transition(
            trigger="done_install",
            source=ClusterState.INSTALLING,
            dest=ClusterState.RUNNING
        )
        self.machine.add_transition(
            trigger="do_pause",
            source=[ClusterState.UNKNOWN, ClusterState.RUNNING, ClusterState.PAUSING],
            dest=ClusterState.PAUSING
        )
        self.machine.add_transition(
            trigger="done_pause",
            source=ClusterState.PAUSING,
            dest=ClusterState.PAUSED
        )
        self.machine.add_transition(
            trigger="do_resume",
            source=[ClusterState.UNKNOWN, ClusterState.PAUSED, ClusterState.RESUMING],
            dest=ClusterState.RESUMING
        )
        self.machine.add_transition(
            trigger="done_resume",
            source=ClusterState.RESUMING,
            dest=ClusterState.RUNNING
        )
        self.machine.add_transition(
            trigger="do_upgrade",
            source=[ClusterState.UNKNOWN, ClusterState.RUNNING, ClusterState.UPGRADING],
            dest=ClusterState.UPGRADING
        )
        self.machine.add_transition(
            trigger="done_upgrade",
            source=ClusterState.UPGRADING,
            dest=ClusterState.RUNNING
        )
        self.machine.add_transition(
            trigger="do_uninstall",
            source=[
                ClusterState.RUNNING, ClusterState.UNKNOWN, ClusterState.UPGRADING,
                ClusterState.PAUSING, ClusterState.PAUSED, ClusterState.RESUMING,
                ClusterState.UNINSTALLING, ClusterState.INSTALLING
            ],
            dest=ClusterState.UNINSTALLING
        )

    @property
    def current_state(self):
        return self.state

    def is_installing(self):
        return self.current_state == ClusterState.INSTALLING

    def is_running(self):
        return self.current_state == ClusterState.RUNNING

    def is_pausing(self):
        return self.current_state == ClusterState.PAUSING

    def is_paused(self):
        return self.current_state == ClusterState.PAUSED

    def is_upgrading(self):
        return self.current_state == ClusterState.UPGRADING

    def is_resuming(self):
        return self.current_state == ClusterState.RESUMING

    def is_uninstalling(self):
        return self.current_state == ClusterState.UNINSTALLING

    def is_unknown(self):
        return self.current_state == ClusterState.UNKNOWN

    def persist_state(self):
        self._cluster_info.upload_cluster_current_state(self.current_state)
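# Usage sketch for ClusterStateMachine (not from the original source). It assumes the `transitions`
# library behind Machine injects the trigger methods (do_pause, done_pause, ...) onto the model,
# which is how the operation classes above appear to call them; the cluster name_id is a placeholder.
csm = ClusterStateMachine(cluster_name_id="demo-cluster-0000", cloud_profile=None)
if csm.is_running():
    csm.do_pause()       # RUNNING -> PAUSING
    csm.persist_state()  # checkpoint the intermediate state to the cluster bucket
    # ... pause work happens here ...
    csm.done_pause()     # PAUSING -> PAUSED
    csm.persist_state()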
class ClusterInstaller(ClusterOperationBase):
    def __init__(self, cfg, kubeconfig=None):
        assert isinstance(cfg, ClusterInstallConfig)
        self._cfg = cfg
        super(ClusterInstaller, self).__init__(
            cluster_name=self._cfg.cluster_name,
            cluster_id=self._cfg.cluster_id,
            cloud_profile=self._cfg.cloud_profile,
            generate_name_id=True,
            dry_run=self._cfg.dry_run
        )
        self._name_id = self._idobj.get_cluster_name_id()

        # Ensure cluster buckets before instantiating any class that uses cluster buckets
        # Note that AXClusterId object is an exception as we need to create cluster name_id
        # first, instantiating buckets, and finally upload cluster name id
        # TODO (#116) bucket initialization should not depend on cluster name id
        AXClusterBuckets(
            name_id=self._name_id,
            aws_profile=self._cfg.cloud_profile,
            aws_region=self._cfg.cloud_region
        ).update()

        self._cluster_config = AXClusterConfig(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        if kubeconfig:
            self._cluster_info.set_kube_config(kubeconfig)

    def pre_run(self):
        if self._csm.is_running():
            logger.info("Cluster is already installed and running. Please ask your administrator")
            sys.exit(0)
        self._csm.do_install()
        self._persist_cluster_state_if_needed()

    def post_run(self):
        self._csm.done_install()
        self._persist_cluster_state_if_needed()

    def persist_username_password_locally(self, username, password, cluster_dns):
        # Dump Argo cluster profile
        if username and password:
            logger.info("Generating Argo cluster profile ...")
            argo_config_path = ARGO_CONFIG.format(fname=self._idobj.get_cluster_name_id())
            with open(argo_config_path, "w") as f:
                f.write("""
insecure: true
password: {password}
url: https://{dns}
username: {username}
""".format(password=password, dns=cluster_dns, username=username))
            if not os.path.exists(ARGO_CONFIG_DEFAULT):
                # If user has not yet configured default argo config, symlink a default config to the one just created
                os.symlink(os.path.basename(argo_config_path), ARGO_CONFIG_DEFAULT)

        summary = """
Cluster Name: {cluster_name}
Cluster ID: {cluster_id}
Cluster Profile Name: {name_id}
Cluster DNS: {dns}
Initial Username: {username}
Initial Password: {password}

Note if your username and password are empty, your cluster has already been successfully
installed before. In this case, your argo CLI profile is NOT configured, as we only generate
initial username and password once, please contact your administrator for more information
to configure your argo CLI profile.
""".format(cluster_name=self._idobj.get_cluster_name(),
           cluster_id=self._idobj.get_cluster_id(),
           name_id=self._name_id,
           dns=cluster_dns,
           username=username,
           password=password)
        logger.info("Cluster information:\n%s%s%s\n", COLOR_GREEN, summary, COLOR_NORM)

    def run(self):
        """
        Main install routine
        :return:
        """
        self._pre_install()
        self._ensure_kubernetes_cluster()
        if self._cfg.dry_run:
            logger.info("DRY RUN: not installing cluster")
            return
        cluster_dns, username, password = self._ensure_argo_microservices()
        self.persist_username_password_locally(username, password, cluster_dns)

    def _pre_install(self):
        """
        Pre install ensures the following stuff:
            - Cluster name/id mapping is created and uploaded
            - A local copy of cluster config is generated
            - Upload stage0 information to S3

        Stage0 is an indication of the fact that at least some part of the cluster could
        have been created. This step is idempotent.
        :return:
        """
        if check_cluster_staging(self._cluster_info, "stage0"):
            logger.info("Skip pre install")
            return

        logger.info("Cluster installation step: Pre Install")

        # After buckets are ensured, we persist cluster name id information.
        # There is no problem re-uploading it if we rerun this step.
        self._idobj.upload_cluster_name_id()

        # Generate raw config dict
        raw_cluster_config_dict = self._generate_raw_cluster_config_dict()

        # Set cluster config object with raw cluster config dict
        self._cluster_config.set_config(raw_cluster_config_dict)

        # Prepare configuration for kube installer. This call will write the kube installer config.
        prepare_kube_install_config(name_id=self._name_id,
                                    aws_profile=self._cfg.cloud_profile,
                                    cluster_info=self._cluster_info,
                                    cluster_config=self._cluster_config)

        # Save config file to s3, which is also stage0 information
        self._cluster_config.save_config()
        logger.info("Cluster installation step: Pre Install successfully finished")

    def _ensure_kubernetes_cluster(self):
        """
        This step won't run if there is "--dry-run" specified. This step assumes pre-install
        is already finished.

        This step does the following:
            - Config kube-installer
            - Call kube-installer to create Kubernetes cluster
            - Persist cluster credentials (Kubeconfig file and ssh key) to S3
            - Upload finalized cluster config and cluster metadata to S3
            - Upload stage1 information to S3

        Stage1 is an indication of the fact that there is a kubernetes cluster ready, and
        we can create micro-services on it.

        This step is NOT necessarily idempotent: e.g., if you created the master but install
        failed due to a cloud provider rate limit, and as a result you have not yet created
        minions, and for some reason you quit your cluster manager container, all your cluster
        credentials can be lost. So if this step fails, the safest way is to uninstall the half
        installed cluster and start another install.
        :return:
        """
        if check_cluster_staging(self._cluster_info, "stage1"):
            logger.info("Skip ensure Kubernetes cluster")
            return

        logger.info("Cluster installation step: Ensure Kubernetes Cluster")

        # Reload config in case stage0 is skipped
        self._cluster_config.reload_config()
        logger.info("Creating cluster with config: \n\n%s\n", pformat(self._cluster_config.get_raw_config()))

        # If dry-run is specified, this step should be skipped
        if self._cfg.dry_run:
            return

        # Call kube-up
        logger.info("\n\n%sCalling kube-up ...%s\n", COLOR_GREEN, COLOR_NORM)
        AXKubeUpDown(cluster_name_id=self._name_id,
                     env=self._cluster_config.get_kube_installer_config(),
                     aws_profile=self._cfg.cloud_profile).up()

        # kube-up will generate cluster metadata.
        # We add information from cluster metadata into cluster config
        logger.info("Loading cluster meta into cluster config ...")
        with open(CLUSTER_META_DATA_PATH, "r") as f:
            data = f.read()
        cluster_meta = yaml.load(data)
        self._cluster_config.load_cluster_meta(cluster_meta)

        # Persist updated cluster config
        self._cluster_config.save_config()

        # Upload cluster metadata
        self._cluster_info.upload_cluster_metadata()

        # Finally persist stage1
        self._cluster_info.upload_staging_info(stage="stage1", msg="stage1")
        logger.info("Cluster installation step: Ensure Kubernetes Cluster successfully finished")

    def install_and_run_platform(self):
        logger.info("Starting platform install")

        # Install Argo micro-services
        # Platform install
        platform = AXPlatform(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              manifest_root=self._cfg.manifest_root,
                              config_file=self._cfg.bootstrap_config)
        install_platform_failed = False
        install_platform_failure_message = ""
        try:
            platform.start()
            platform.stop_monitor()
        except Exception as e:
            logger.exception(e)
            install_platform_failed = True
            install_platform_failure_message = str(e) + "\nPlease manually check the cluster status and retry installation with the same command if the error is transient."

        if install_platform_failed:
            raise RuntimeError(install_platform_failure_message)

        # In case platform is successfully installed,
        # connect to axops to get initial username and password
        username, password = self._get_initial_cluster_credentials()
        logger.info("Done with platform install")
        return platform.cluster_dns_name, username, password

    def post_install(self):
        # Persist manifests to S3
        self._cluster_info.upload_platform_manifests_and_config(
            platform_manifest_root=self._cfg.manifest_root,
            platform_config=self._cfg.bootstrap_config)

        # Finally persist stage2 information
        self._cluster_info.upload_staging_info(stage="stage2", msg="stage2")
        logger.info("Cluster installation step: Ensure Argo Micro-services successfully finished")

    def _ensure_argo_microservices(self):
        """
        This step won't run if there is "--dry-run" specified. This step assumes there is
        a running Kubernetes cluster.

        This step does the following:
            - ensure ASG count
            - ensure trusted CIDRs
            - install Argo software on to the cluster and make sure they are up and running
              (we don't monitor if a microservice is having a crash loop)
            - remove manager CIDR if it is not part of user-specified trusted CIDRs
            - upload stage2 information to S3

        Stage2 is an indication that the cluster has been successfully installed: Kubernetes
        is up and running, and all Argo software are up and running. It does not ensure that
        none of the Argo software is in a crash loop.

        This step is idempotent.
        :return: cluster_dns_name, username, password
        """
        logger.info("Cluster installation step: Ensure Argo Micro-services")

        # Reload config in case stage0 and stage1 are skipped
        self._cluster_config.reload_config()
        trusted_cidrs = self._cluster_config.get_trusted_cidr()

        # Instantiate AXBootstrap object. There are a bunch of stand-alone tasks we need to
        # perform using that object.
        axbootstrap = AXBootstrap(cluster_name_id=self._name_id,
                                  aws_profile=self._cfg.cloud_profile,
                                  region=self._cluster_config.get_region())

        # We allow access from everywhere during installation phase, but will remove this access
        # if user does not specify 0.0.0.0/0 as their trusted CIDR
        axbootstrap.modify_node_security_groups(old_cidr=[],
                                                new_cidr=trusted_cidrs + [EC2IPPermission.AllIP],
                                                action_name="allow-creator")

        if check_cluster_staging(self._cluster_info, "stage2"):
            # TODO: some duplicated logic here, might need to combine them.
            logger.info("Skip ensure Argo micro-services since cluster has already been successfully installed")
            platform = AXPlatform(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
            if EC2IPPermission.AllIP not in trusted_cidrs:
                axbootstrap.modify_node_security_groups(old_cidr=[EC2IPPermission.AllIP],
                                                        new_cidr=[],
                                                        action_name="disallow-creator")
            return platform.get_cluster_external_dns(), "", ""

        # Modify ASG
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = int(self._cluster_config.get_min_node_count()) - axsys_node_count
        axuser_max_count = int(self._cluster_config.get_max_node_count()) - axsys_node_count
        axbootstrap.modify_asg(min=axuser_min_count, max=axuser_max_count)

        cluster_dns, username, password = self.install_and_run_platform()
        self.post_install()

        # Remove access from 0.0.0.0/0 if this is not what user specifies
        if EC2IPPermission.AllIP not in trusted_cidrs:
            axbootstrap.modify_node_security_groups(old_cidr=[EC2IPPermission.AllIP],
                                                    new_cidr=[],
                                                    action_name="disallow-creator")
        return cluster_dns, username, password

    @retry(wait_fixed=5, stop_max_attempt_number=10)
    def _get_initial_cluster_credentials(self):
        """
        This function connects to the axops pod to get the cluster's initial credentials
        :return: (username, password)
        """
        # TODO: a less hacky way of getting initial credentials?
        ns_conf = "--namespace axsys --kubeconfig {config}".format(config=self._cluster_info.get_kube_config_file_path())
        cmd = "kubectl " + ns_conf + " exec $(kubectl " + ns_conf + " get pods -l app=axops-deployment | grep axops | awk '{print $1}') /axops/bin/axpassword -c axops"
        ret = subprocess.check_output(cmd, shell=True)
        username = None
        password = None
        for line in ret.split("\n"):
            if line.startswith("Username"):
                # Username line has format "Username: xxxxxxx"
                username = line[len("Username: "):]
            elif line.startswith("Password"):
                # Password line has format "Password: xxxxxx"
                password = line[len("Password: "):]
        assert username and password, "Failed to get username and password from axops pod: {}".format(ret)
        return username, password

    def _generate_raw_cluster_config_dict(self):
        """
        This is a standalone method to generate cluster config dictionary based on install config.
        We might want to move it to ax.platform.cluster_config package for sanity
        :return:
        """
        config_file_name = CLUSTER_CONFIG_TEMPLATES[self._cfg.cluster_size]
        config_file_full_path = os.path.join(*[CLUSTER_CONFIG_ROOT, self._cfg.cluster_type, config_file_name])
        with open(config_file_full_path, "r") as f:
            config = json.load(f)

        if Cloud().target_cloud_aws():
            return self._generate_raw_cluster_config_dict_aws(config)
        elif Cloud().target_cloud_gcp():
            return self._generate_raw_cluster_config_dict_gcp(config)
        else:
            # Should never come here as aws/gcp is ensured at CLI validation level
            return config

    def _generate_raw_cluster_config_dict_aws(self, config):
        """
        Generate AWS specific cluster config.
        :param config:
        :return:
        """
        # TODO: once we support installing with config file, we only overwrite when item is specifically set through CLI
        config["cloud"]["configure"]["region"] = self._cfg.cloud_region
        config["cloud"]["configure"]["placement"] = self._cfg.cloud_placement
        config["cloud"]["trusted_cidr"] = self._cfg.trusted_cidrs
        config["cloud"]["vpc_id"] = self._cfg.vpc_id

        # If we install into an existing VPC, i.e. vpc_id is not None, we are going to fetch
        # vpc_cidr_base from cluster metadata after the cluster is created.
        config["cloud"]["vpc_cidr_base"] = self._cfg.vpc_cidr_base if not self._cfg.vpc_id else None
        config["cloud"]["subnet_size"] = self._cfg.subnet_mask_size
        config["cloud"]["configure"]["sandbox_enabled"] = self._cfg.enable_sandbox

        # TODO (#119): might want to remove this field as this was used for hacks before. Setting it to "dev" for now
        config["cloud"]["configure"]["cluster_user"] = "dev"

        # TODO (#117): Switch all spot related options by literals rather than true/false and some other hacks
        # also need to revise the need of specifying a spot price during installation
        if self._cfg.spot_instances_option in [SpotInstanceOption.PARTIAL_SPOT, SpotInstanceOption.ALL_SPOT]:
            spot_instances_enabled = "true"
        else:
            spot_instances_enabled = "false"
        config["cloud"]["configure"]["spot_instances_enabled"] = spot_instances_enabled
        config["cloud"]["configure"]["spot_instances_option"] = self._cfg.spot_instances_option
        config["cloud"]["node_spot_price"] = DEFAULT_NODE_SPOT_PRICE

        # Configure master
        axsys_node_type = config["cloud"]["configure"]["axsys_node_type"]
        axsys_node_max = config["cloud"]["configure"]["axsys_node_count"]
        axuser_node_type = config["cloud"]["configure"]["axuser_node_type"]
        axuser_node_max = config["cloud"]["configure"]["max_node_count"] - axsys_node_max
        cluster_type = config["cloud"]["configure"]["cluster_type"]
        if self._cfg.cluster_size != AXClusterSize.CLUSTER_USER_PROVIDED:
            master_config = KubeMasterResourceConfig(usr_node_type=axuser_node_type,
                                                     usr_node_max=axuser_node_max,
                                                     ax_node_type=axsys_node_type,
                                                     ax_node_max=axsys_node_max,
                                                     cluster_type=cluster_type)
            if self._cfg.cluster_size == AXClusterSize.CLUSTER_MVC:
                # MVC cluster does not follow the heuristics we used to configure master
                config["cloud"]["configure"]["master_type"] = "m3.xlarge"
            else:
                config["cloud"]["configure"]["master_type"] = master_config.master_instance_type
            config["cloud"]["configure"]["master_config_env"] = master_config.kube_up_env

        # TODO (#121) Need to revise the relationship between user_on_demand_nodes and node minimum, system node count
        config["cloud"]["configure"]["axuser_on_demand_nodes"] = self._cfg.user_on_demand_nodes

        # Get AMI information
        ami_name = self._cfg.software_info.ami_name
        ami_id = AMI(aws_profile=self._cfg.cloud_profile,
                     aws_region=self._cfg.cloud_region).get_ami_id_from_name(ami_name=ami_name)
        config["cloud"]["configure"]["ami_id"] = ami_id

        # Other configurations
        config["cloud"]["configure"]["autoscaler_scan_interval"] = str(self._cfg.autoscaling_interval) + "s"
        config["cloud"]["configure"]["support_object_store_name"] = str(self._cfg.support_object_store_name)
        return config

    def _generate_raw_cluster_config_dict_gcp(self, config):
        """
        Generate GCP specific cluster config.
        :param config:
        :return:
        """
        config["cloud"]["trusted_cidr"] = self._cfg.trusted_cidrs
        return config

    def update_and_save_config(self, cluster_bucket=None):
        """
        Update the config to use the given bucket and upload cluster_config and kubeconfig
        to the given bucket.
        """
        raw_cluster_config_dict = self._generate_raw_cluster_config_dict()
        self._cluster_config.set_config(raw_cluster_config_dict)
        self._cluster_config.set_cluster_provider(ClusterProvider.USER)
        self._cluster_config.set_support_object_store_name(cluster_bucket)

        # Save config file to s3.
        self._cluster_config.save_config()
        self._cluster_info.upload_kube_config()
class AXMasterManager:
    def __init__(self, cluster_name_id, region=None, profile=None):
        self.cluster_name_id = cluster_name_id

        # Region and profile info can be passed in with upgrade code path,
        # when this is run from axclustermanager outside cluster.
        self.region = AWSMetaData().get_region() if region is None else region
        self.profile = profile
        if profile is None:
            session = boto3.Session(region_name=self.region)
        else:
            session = boto3.Session(region_name=self.region, profile_name=profile)
        self.ec2 = session.resource('ec2')
        self.client = session.client('ec2')
        self.cluster_info = AXClusterInfo(cluster_name_id=cluster_name_id, aws_profile=profile)
        self.cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id, aws_profile=profile)

        cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self.s3_bucket = cluster_config_path.bucket()
        self.s3_config_prefix = cluster_config_path.master_config_dir()
        self.s3_attributes_path = cluster_config_path.master_attributes_path()
        self.s3_user_data_path = cluster_config_path.master_user_data_path()
        logger.info("Create MasterManager in region %s, attributes path: %s and user_data_path: %s",
                    self.region, self.s3_attributes_path, self.s3_user_data_path)

        # The EC2 instance object for the current master
        self.master_instance = None
        # Properties/attributes to use when launching the new master
        self.attributes = {}

        # For upgrades.
        # The following values are set to None by master manager but to non-None values by upgrade code.
        self.aws_image = None
        self.instance_profile = None

        self.event_notification_client = EventNotificationClient(FACILITY_PLATFORM)

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def discover_master(self, state=None):
        """
        Discovers the currently running master for the given cluster name.
        """
        if not state:
            state = [EC2InstanceState.Running]
        response = self.client.describe_instances(
            Filters=[
                {'Name': 'tag:Name', 'Values': [self.cluster_name_id + '-master']},
                {'Name': 'instance-state-name', 'Values': state}
            ]
        )

        # Confirm that there is only 1 master
        if len(response['Reservations']) == 0:
            logger.info("Master with state %s not found", state)
            return None
        assert len(response['Reservations']) == 1, "More than 1 master running (reservations != 1)!"
        assert len(response['Reservations'][0]['Instances']) == 1, \
            "Not exactly 1 master instance is running! {}".format(response['Reservations'][0]['Instances'])
        return response['Reservations'][0]['Instances'][0]['InstanceId']

    def user_data_fixup(self, user_data):
        """
        The original user-data used for creating the master can become obsolete after upgrades.
        There are 5 fields from the original user-data that need to be "fixed": the
        SERVER_BINARY_TAR_URL, SALT_TAR_URL, SERVER_BINARY_TAR_HASH, SALT_TAR_HASH and the
        wget command that downloads the bootstrap-script.
        """
        from ax.platform.kube_env_config import kube_env_update

        # TODO: It's not ideal to use env variables for passing arguments.
        # Env variables could be different between running as server and from upgrade.
        kube_version = os.getenv('KUBE_VERSION', os.getenv('NEW_KUBE_VERSION')).strip()
        cluster_install_version = os.getenv('AX_CLUSTER_INSTALL_VERSION', os.getenv('NEW_CLUSTER_INSTALL_VERSION')).strip()
        server_binary_tar_hash = os.getenv('SERVER_BINARY_TAR_HASH', os.getenv('NEW_KUBE_SERVER_SHA1')).strip()
        salt_tar_hash = os.getenv('SALT_TAR_HASH', os.getenv('NEW_KUBE_SALT_SHA1')).strip()

        updates = {
            "new_kube_version": kube_version,
            "new_cluster_install_version": cluster_install_version,
            "new_kube_server_hash": server_binary_tar_hash,
            "new_kube_salt_hash": salt_tar_hash,
            "new_api_servers": self.attributes['private_ip_address'],
        }

        dec = zlib.decompressobj(32 + zlib.MAX_WBITS)  # offset 32 to skip the header
        unzipped_user_data = dec.decompress(base64.b64decode(user_data))

        # Zip output buffer. For details: http://bit.ly/2gv3WKt
        comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        zipped_data = comp.compress(kube_env_update(unzipped_user_data, updates)) + comp.flush()

        # Convert output to base64 encoded
        logger.info("User data fixup completed")
        return base64.b64encode(zipped_data)

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_user_data(self):
        """
        Gets the user-data for the current master. Note that the user-data is base64 encoded
        when it is downloaded. Writes the data into a file.
        """
        # The user-data is base64 encoded.
        user_data = self.client.describe_instance_attribute(
            Attribute='userData', InstanceId=self.master_instance.instance_id)['UserData']['Value']

        # Download user-data and store it into a temporary file. This data is base64 encoded.
        # It is better to use a well-known location for this file rather than one generated by mkstemp (or variants).
        # That way, this file could be populated the first time this pod runs or even later by simply downloading
        # the user-data from S3.
        try:
            user_data = self.user_data_fixup(user_data)
        except Exception as e:
            logger.exception("Failed while fixing up user-data")
            raise AXPlatformException("Failed while fixing up user-data: " + str(e))

        with open(USER_DATA_FILE_NEW, "w") as f:
            f.write(user_data)
        return USER_DATA_FILE_NEW

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_master_pd_volume_metadata(self):
        """
        Gets the metadata for the Master's persistent disk (EBS volume).
        """
        volume_metadata = self.client.describe_volumes(
            Filters=[
                {'Name': 'attachment.instance-id', 'Values': [self.master_instance.instance_id, ]},
                {'Name': 'tag:Name', 'Values': [self.cluster_name_id + "-master-pd"]}
            ])
        assert volume_metadata is not None, "Failed to retrieve volume_metadata"
        return volume_metadata

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_route_table_id(self):
        """
        Gets the route table used by the given cluster.
        """
        route_table_id = None
        response = self.client.describe_route_tables(
            Filters=[{'Name': 'tag:KubernetesCluster', 'Values': [self.cluster_name_id]}])
        assert len(response["RouteTables"]) == 1, "There should be a single route-table!"
        assert "RouteTableId" in response["RouteTables"][0], "RouteTableId not in response"
        route_table_id = response["RouteTables"][0]["RouteTableId"]
        logger.debug("Using route-table-id %s", route_table_id)
        return route_table_id

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def get_root_dev_attrs(self):
        assert self.master_instance, "Master instance not set"
        root_dev_id = None
        for dev in self.master_instance.block_device_mappings:
            if dev['DeviceName'] == self.master_instance.root_device_name:
                root_dev_id = dev['Ebs']['VolumeId']
                dev_metadata = self.client.describe_volumes(VolumeIds=[root_dev_id])
                root_dev_size = dev_metadata['Volumes'][0]['Size']
                root_dev_type = dev_metadata['Volumes'][0]['VolumeType']
                break
        assert self.master_instance.root_device_name and str(root_dev_size) and root_dev_type, \
            "Failed to get root device attributes"
        logger.info("Root device attributes: %s, %s, %s",
                    self.master_instance.root_device_name, str(root_dev_size), root_dev_type)
        self.attributes['root_dev_name'] = self.master_instance.root_device_name
        self.attributes['root_dev_size'] = str(root_dev_size)
        self.attributes['root_dev_type'] = root_dev_type

    def populate_attributes(self):
        """
        Collects attributes that will be persisted and used for spinning up the new master instance.
        Populates the "attributes" member dict with all the values.
        """
        # Upgrade might overwrite these attributes. Use them if set.
        # Otherwise get them from existing master instance.
        image_id = self.aws_image if self.aws_image else self.master_instance.image_id
        instance_profile = self.instance_profile if self.instance_profile else self.master_instance.iam_instance_profile["Arn"]

        self.attributes['image_id'] = image_id
        self.attributes['instance_type'] = self.master_instance.instance_type
        self.attributes['vpc_id'] = self.master_instance.vpc_id
        self.attributes['key_name'] = self.master_instance.key_name
        self.attributes['placement'] = self.master_instance.placement
        self.attributes['arn'] = instance_profile
        self.attributes['subnet_id'] = self.master_instance.subnet_id
        self.attributes['private_ip_address'] = self.master_instance.private_ip_address
        target_sgs = []
        for sg in self.master_instance.security_groups:
            target_sgs.append(sg["GroupId"])
        self.attributes['security_group_ids'] = target_sgs
        self.attributes['user_data_file'] = self.get_user_data()
        self.attributes['master_tags'] = self.master_instance.tags

        # Retrieve master-pd and master-eip from the volume_metadata
        volume_metadata = self.get_master_pd_volume_metadata()
        if volume_metadata['Volumes'] and volume_metadata['Volumes'][0]:
            if volume_metadata['Volumes'][0]['VolumeId']:
                vol_id = volume_metadata['Volumes'][0]['VolumeId']
                self.attributes['master_pd_id'] = vol_id
                self.attributes['master_pd_device'] = volume_metadata['Volumes'][0]['Attachments'][0]['Device']
                # Retrieve tags of master-pd. Get EIP from master.
                for tag in volume_metadata['Volumes'][0]['Tags']:
                    if tag['Key'] == "kubernetes.io/master-ip":
                        master_eip = tag["Value"]
                        self.attributes['master_eip'] = master_eip
                        break

        assert self.attributes['master_pd_id'] is not None, "Failed to find Master's persistent disk"
        assert self.attributes['master_pd_device'] is not None, "Failed to find attachment info for Master's persistent disk"
        assert self.attributes['master_eip'] is not None, "Failed to find Master's Elastic IP"

        self.attributes['route_table_id'] = self.get_route_table_id()
        self.attributes['pod_cidr'] = self.cluster_config.get_master_pod_cidr()
        self.attributes['ebs_optimized'] = self.master_instance.ebs_optimized
        # Get root device attributes
        self.get_root_dev_attrs()

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def add_tags(self, instance):
        """
        Adds tags to the new master instance.
        :param instance: The new master ec2 instance.
        """
        response = self.client.create_tags(
            Resources=[instance.instance_id],
            Tags=self.attributes['master_tags']
        )
        logger.info("Attached tags to new master")

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def attach_eip(self, instance):
        """
        Attaches the EIP to the master instance.
        :param instance: The new master ec2 instance.
        """
        eip_meta = self.client.describe_addresses(PublicIps=[self.attributes['master_eip']])
        assert eip_meta is not None, "Failed to get details about EIP " + self.attributes['master_eip']
        assert eip_meta['Addresses'] and len(eip_meta['Addresses']) == 1, "Error getting EIP address details"
        response = self.client.associate_address(
            InstanceId=instance.instance_id,
            AllocationId=eip_meta['Addresses'][0]['AllocationId'],
            AllowReassociation=True
        )
        logger.info("Attached EIP to new master: %s", response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def attach_volume(self, instance):
        """
        Attaches the EBS volume to the master instance.
        :param instance: The new master ec2 instance.
        """
        response = instance.attach_volume(
            VolumeId=self.attributes['master_pd_id'],
            Device=self.attributes['master_pd_device'])
        logger.info("Attached volume to new master: %s", response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def replace_route(self, instance):
        response = self.client.replace_route(
            RouteTableId=self.attributes['route_table_id'],
            DestinationCidrBlock=self.attributes['pod_cidr'],
            InstanceId=instance.instance_id
        )
        logger.info("Replaced master route %s with %s: %s",
                    self.attributes['pod_cidr'], instance.instance_id,
                    response['ResponseMetadata']['HTTPStatusCode'])

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=5)
    def run_new_master(self, user_data):
        """
        Uses the boto APIs to run new instances of the master. Retries in case of failure.
        :param user_data: The user-data to use for the new instance.
        """
        try:
            response = self.client.run_instances(
                ImageId=self.attributes['image_id'],
                MinCount=1,
                MaxCount=1,
                KeyName=self.attributes['key_name'],
                UserData=user_data,
                InstanceType=self.attributes['instance_type'],
                Placement=self.attributes['placement'],
                IamInstanceProfile={"Arn": self.attributes['arn']},
                NetworkInterfaces=[
                    {
                        'DeviceIndex': 0,
                        'SubnetId': self.attributes['subnet_id'],
                        'PrivateIpAddress': self.attributes['private_ip_address'],
                        'AssociatePublicIpAddress': True,
                        'Groups': self.attributes['security_group_ids']
                    },
                ],
                BlockDeviceMappings=[
                    {
                        'DeviceName': self.attributes['root_dev_name'],
                        'Ebs': {
                            'VolumeSize': int(self.attributes['root_dev_size']),
                            'VolumeType': self.attributes['root_dev_type']
                        }
                    },
                    # Ephemeral devices to match kube-up behavior to get SSD attached.
                    {
                        'DeviceName': '/dev/sdc',
                        'VirtualName': 'ephemeral0'
                    },
                    {
                        'DeviceName': '/dev/sdd',
                        'VirtualName': 'ephemeral1'
                    },
                    {
                        'DeviceName': '/dev/sde',
                        'VirtualName': 'ephemeral2'
                    },
                    {
                        'DeviceName': '/dev/sdf',
                        'VirtualName': 'ephemeral3'
                    },
                ],
                EbsOptimized=self.attributes['ebs_optimized']
            )
            return response
        except Exception as e:
            logger.exception("Error running instances: %s", str(e))

    def launch_new_master(self):
        """
        Launches the new master instance.
        """
        logger.info("Launching new master ...")
        # Read the base64 encoded data and decode it before using it. AWS will
        # base64 encode it again.
        with open(self.attributes['user_data_file'], 'r') as user_data_file:
            user_data = base64.b64decode(user_data_file.read())

        response = self.run_new_master(user_data)
        new_master_id = response["Instances"][0]['InstanceId']
        logger.info("Waiting for new master %s to start", new_master_id)
        new_master = self.ec2.Instance(new_master_id)

        # Each call to ec2_instance.wait_until_running below will wait for a max of 15 minutes.
        # Give enough time for the instance to start...
        counter = 0
        while (counter < 2):
            try:
                new_master.wait_until_running()
                counter = counter + 1
            except botocore.exceptions.WaiterError as we:
                logger.debug("Still waiting for new master to run...")
                pass

        logger.info("New master with instance id %s is up!", new_master.instance_id)
        self.add_tags(new_master)
        self.attach_eip(new_master)
        self.attach_volume(new_master)
        self.replace_route(new_master)
        return new_master

    @retry(wait_fixed=2000)
    def wait_for_termination(self):
        """
        Waits for the termination of the currently running master instance.
        """
        # Check if master api server is alive and if not, terminate master
        try:
            logger.info("Checking if Master API server is alive...")
            self.check_master_api_server()
            logger.info("Master API server is alive...")
        except Exception as e:
            if isinstance(e, urllib3.exceptions.HTTPError):
                logger.error("Got the following exception while trying to check master api server {}".format(e))
                logger.info("Assuming master is bad and terminating it...")
                self.terminate_master()
                logger.info("Done terminating master")
                return
            else:
                logger.warn("Got the following error from Kubernetes Master API Server {}. Looks like it is alive so ignoring this temporary error".format(e))

        logger.debug("Waiting for master termination signal ...")
        self.master_instance.wait_until_terminated()
        logger.info("Master down!")

    @retry(wait_exponential_multiplier=2000, stop_max_attempt_number=3, retry_on_exception=print_exception)
    def terminate_master(self):
        """
        Terminate current master instance and wait until it's done.
        """
        logger.info("Terminating master %s.", self.master_instance)
        self.client.terminate_instances(InstanceIds=[self.master_instance.instance_id])
        self.master_instance.wait_until_terminated()

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=3)
    def stop_master(self):
        stop_master_requested = False
        master_instance_id = self.discover_master(state=[EC2InstanceState.Stopping, EC2InstanceState.Stopped])
        if master_instance_id:
            stop_master_requested = True

        if not stop_master_requested:
            master_instance_id = self.discover_master(state=["*"])
            if not master_instance_id:
                raise AXPlatformException("Cannot find master instance")
            try:
                self.client.stop_instances(InstanceIds=[master_instance_id])
            except ClientError as ce:
                if "UnsupportedOperation" in str(ce) and "StopInstances" in str(ce):
                    logger.warning("Master instance %s is a spot instance, which cannot be stopped.", master_instance_id)
                    return
                elif "IncorrectInstanceState" in str(ce):
                    # Master could be in "terminating", "terminated", or "stopped" state. It does not
                    # make sense that the first 2 states could kick in unless there is some human
                    # intervention, so the code will be stuck waiting for master to go into "stopped"
                    # state, which is a good indication for checking manually
                    pass
                else:
                    raise ce

        logger.info("Waiting for master %s to get into state \"stopped\"", master_instance_id)
        while True:
            stopped_master = self.discover_master(state=[EC2InstanceState.Stopped])
            if stopped_master:
                logger.info("Master %s successfully stopped", master_instance_id)
                return
            else:
                time.sleep(5)

    @retry(retry_on_exception=default_aws_retry, wait_exponential_multiplier=2000, stop_max_attempt_number=3)
    def restart_master(self):
        started_master_id = self.discover_master(state=[EC2InstanceState.Running])
        if started_master_id:
            logger.info("Master %s is already running", started_master_id)
            return

        stopped_master_id = self.discover_master(state=[EC2InstanceState.Stopped])
        if not stopped_master_id:
            raise AXPlatformException("Cannot find a previously stopped master instance")

        # As we can always start a "stopped" instance, any other exception will be thrown out
        self.client.start_instances(InstanceIds=[stopped_master_id])
        logger.info("Waiting for master %s to get into state \"running\"", stopped_master_id)
        while True:
            running_master_id = self.discover_master(state=[EC2InstanceState.Running])
            if running_master_id:
                logger.info("Master %s successfully started", running_master_id)
                return
            else:
                time.sleep(5)

    def save_master_config(self, file_path):
        """
        Uploads the master attributes and user-data (in base64 encoded format) into a directory
        in the s3 bucket.
        """
        with open(file_path, 'r') as user_data_file:
            user_data = user_data_file.read()
        self.cluster_info.upload_master_config_to_s3(self.attributes, user_data)

    def user_data_updated(self):
        """
        Get both old and new user data file content and compare them.
        Return True if they are different.
        """
        with open(USER_DATA_FILE_S3, "r") as f:
            old = f.read()
        with open(USER_DATA_FILE_NEW, "r") as f:
            new = f.read()
        return old != new

    def send_notification(self, code, message):
        try:
            self.event_notification_client.send_message_to_notification_center(
                code, detail={'message': "[master_manager] " + message})
        except Exception as exc:
            logger.exception("Failed to send event to notification center: %s", exc)
        return

    def run(self):
        """
        The main method for the MasterManager.
        """
        logger.info("Running the MasterManager!")
        attr_str = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
        if attr_str is not None:
            self.attributes = json.loads(attr_str)
            self.attributes['user_data_file'] = USER_DATA_FILE_S3

        # Check if the master is running. Update the self.master_instance object.
        try:
            instance_id = self.discover_master()
            if instance_id is not None:
                self.master_instance = self.ec2.Instance(instance_id)
                logger.info("Master instance discovered: %s", self.master_instance.instance_id)
                # This will retry for a while and then throw an exception if master api server is unreachable
                self.check_master_api_server()
                if not self.attributes:
                    # This is needed only for first startup when cluster is created.
                    logger.debug("Populating attributes")
                    self.populate_attributes()
                    logger.debug("Saving master's config into S3")
                    self.save_master_config(USER_DATA_FILE_NEW)
                    logger.info("Master config uploaded to s3")
        except Exception as e:
            raise AXPlatformException("Failed to discover master: " + str(e))

        while True:
            if self.master_instance is not None:
                self.wait_for_termination()
                message = "Master instance with id " + \
                          self.master_instance.instance_id + " terminated. A " + \
                          "new master instance will be created. This should " + \
                          "take a few minutes"
            else:
                logger.info("Master not running")
                message = "Master instance not found. " + \
                          "A new master instance will be created. This should " + \
                          "take a few minutes."
            self.send_notification(CODE_PLATFORM_ERROR, message)

            new_master = self.launch_new_master()
            self.master_instance = self.ec2.Instance(new_master.instance_id)
            logger.info("New master instance %s running", self.master_instance.instance_id)
            self.send_notification(CODE_PLATFORM_CRITICAL, "New master " +
                                   "instance with id {} started".format(self.master_instance.instance_id))

            logger.info("Wait for {} minutes before running checks...".format(WAIT_TIME_POST_RESTART_MIN))
            time.sleep(WAIT_TIME_POST_RESTART_MIN * const.SECONDS_PER_MINUTE)
            logger.info("Done waiting. Now back to checks")

    @retry_unless()
    def check_master_api_server(self):
        c = KubernetesApiClient()
        c.api.read_namespaced_service("default", "kubernetes")

    def upgrade(self):
        """
        Entry point for master upgrade. Supports upgrade of:
            - Kubernetes versions;
            - AMI image;
            - Selected list of kube_env variables.
        """
        logger.info("Starting master upgrade!")
        ami_name = os.getenv("AX_AWS_IMAGE_NAME")
        assert ami_name, "Fail to detect AMI name from environ"
        ami_id = AMI(aws_region=self.region, aws_profile=self.profile).get_ami_id_from_name(ami_name=ami_name)
        logger.info("Using ami %s for new master", ami_id)

        s3_data = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
        if s3_data is None:
            attr = None
        else:
            attr = json.loads(self.cluster_info.get_master_config(USER_DATA_FILE_S3))
        instance_id = self.discover_master()
        terminating = False
        launching = False
        if instance_id is None:
            # This is possible if previous upgrade fails after termination but before new master start.
            # Simply restart master in this case.
            # This could also happen when master crashes in the first place and upgrade is started.
            # We would use old config to start master and rerun upgrade again.
            logger.info("No running master. S3 attr %s.", USER_DATA_FILE_S3)
            assert attr is not None, "No master instance and no master config."
            self.attributes = attr
            self.attributes['user_data_file'] = USER_DATA_FILE_S3
            self.ensure_master_tags()
            self.save_master_config(USER_DATA_FILE_S3)
            launching = True
        else:
            self.master_instance = self.ec2.Instance(instance_id)
            logger.info("Running master %s.", instance_id)
            self.aws_image = ami_id
            self.instance_profile = AXClusterInstanceProfile(self.cluster_name_id,
                                                             region_name=self.region,
                                                             aws_profile=self.profile).get_master_arn()
            self.populate_attributes()
            master_tag_updated = self.ensure_master_tags()
            # TODO: Possible race here.
            # If the upgrade is interrupted after the config is saved but before the master is terminated,
            # the next upgrade attempt will assume the master has already been upgraded.
            # Manually terminating the instance is required to recover in that case.
            if attr != self.attributes or self.user_data_updated() or master_tag_updated:
                self.save_master_config(USER_DATA_FILE_NEW)
                terminating = True
                launching = True

        if terminating:
            self.terminate_master()
            logger.info("Done terminating %s", instance_id)
        if launching:
            logger.info("Done launching %s", self.launch_new_master())

    def ensure_master_tags(self):
        """
        During upgrade, we need to ensure the master has the AXClusterNameID, AXCustomerID and AXTier tags (#23)
        :return: True if we updated master tags
        """
        for tag in self.attributes['master_tags']:
            if tag["Key"] == "AXTier":
                # Master already has the updated tags
                return False

        self.attributes['master_tags'] += [
            {
                "Key": "AXCustomerID",
                "Value": AXCustomerId().get_customer_id()
            },
            {
                "Key": "AXTier",
                "Value": "master"
            },
            {
                "Key": "AXClusterNameID",
                "Value": self.cluster_name_id
            },
        ]
        return True
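# --- Illustrative sketch (not part of the original module) ---
# The tag handling in ensure_master_tags() above can be factored into a pure function, which
# makes the idempotency easy to unit test. The function name and the sample customer id /
# cluster name below are hypothetical.

def merge_master_tags(existing_tags, customer_id, cluster_name_id):
    """Return (tags, updated): append the AX* tags unless AXTier is already present."""
    if any(tag["Key"] == "AXTier" for tag in existing_tags):
        return existing_tags, False
    return existing_tags + [
        {"Key": "AXCustomerID", "Value": customer_id},
        {"Key": "AXTier", "Value": "master"},
        {"Key": "AXClusterNameID", "Value": cluster_name_id},
    ], True


if __name__ == "__main__":
    tags, updated = merge_master_tags([{"Key": "Name", "Value": "master"}], "cust-123", "demo-0000")
    assert updated and any(t["Key"] == "AXTier" for t in tags)
    # A second pass is a no-op, mirroring the early return in ensure_master_tags()
    tags2, updated2 = merge_master_tags(tags, "cust-123", "demo-0000")
    assert not updated2 and tags2 == tags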
class ClusterResumer(ClusterOperationBase): def __init__(self, cfg): assert isinstance(cfg, ClusterRestartConfig) self._cfg = cfg super(ClusterResumer, self).__init__(cluster_name=self._cfg.cluster_name, cluster_id=self._cfg.cluster_id, cloud_profile=self._cfg.cloud_profile, dry_run=self._cfg.dry_run) # This will raise exception if name/id mapping cannot be found self._name_id = self._idobj.get_cluster_name_id() self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) self._cluster_config = AXClusterConfig( cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) self._master_manager = AXMasterManager( cluster_name_id=self._name_id, region=self._cluster_config.get_region(), profile=self._cfg.cloud_profile) self._bootstrap_obj = AXBootstrap( cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, region=self._cluster_config.get_region()) # Initialize node count to 1 as master is not in an auto scaling group self._total_nodes = 1 self._cidr = str(get_public_ip()) + "/32" self._software_info = SoftwareInfo(info_dict=yaml.load( self._cluster_info.download_cluster_software_info())) def pre_run(self): if self._cluster_info.is_cluster_supported_by_portal(): raise RuntimeError( "Cluster is currently supported by portal. Please login to portal to perform cluster management operations." ) if self._csm.is_running(): logger.info("Cluster is already running.") sys.exit(0) if not check_cluster_staging(cluster_info_obj=self._cluster_info, stage="stage2"): raise RuntimeError( "Cluster is not successfully installed: Stage2 information missing! Operation aborted." ) self._csm.do_resume() self._persist_cluster_state_if_needed() def post_run(self): self._csm.done_resume() self._persist_cluster_state_if_needed() def run(self): if self._cfg.dry_run: logger.info("DRY RUN: Resuming cluster %s with software info %s", self._name_id, self._software_info.to_dict()) return logger.info("%s\n\nResuming cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) # Main resume cluster routine try: self._master_manager.restart_master() self._recover_auto_scaling_groups() self._wait_for_master() self._ensure_restarter_access() self._wait_for_minions() ensure_manifest_temp_dir() self._start_platform() logger.info("\n\n%sSuccessfully resumed cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) except Exception as e: logger.exception(e) raise RuntimeError(e) finally: self._disallow_restarter_access_if_needed() def _start_platform(self): """ This step brings up Argo platform services :return: """ logger.info("Bringing up Argo platform ...") self._cluster_info.download_platform_manifests_and_config( target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT, target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH) platform = AXPlatform(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, manifest_root=TEMP_PLATFORM_MANIFEST_ROOT, config_file=TEMP_PLATFORM_CONFIG_PATH, software_info=self._software_info) platform.start() platform.stop_monitor() def _wait_for_master(self): """ This step waits for master to be up and running :return: """ count = 0 running_master = None while count < WAIT_FOR_RUNNING_MASTER_RETRY: logger.info( "Waiting for master to be up and running. 
Trail %s / %s", count, WAIT_FOR_RUNNING_MASTER_RETRY) running_master = self._master_manager.discover_master( state=[EC2InstanceState.Running]) if not running_master: time.sleep(5) else: logger.info("%sMaster %s is running%s", COLOR_GREEN, running_master, COLOR_NORM) break count += 1 if count == WAIT_FOR_RUNNING_MASTER_RETRY: raise RuntimeError( "Timeout waiting for master {} to come up. Please manually check cluster status" .format(running_master)) def _wait_for_minions(self): """ This step waits for all minions to come up and registered in Kubernetes master :return: """ # Get kubernetes access token self._cluster_info.download_kube_config() kube_config = self._cluster_info.get_kube_config_file_path() # Wait for nodes to be ready. # Because we made sure during pause that kubernetes master already knows that all minions are gone, # we don't need to worry about cached minions here logger.info("Wait 120 seconds before Kubernetes master comes up ...") time.sleep(120) kubectl = KubernetesApiClient(config_file=kube_config) logger.info("Waiting for all Kubelets to be ready ...") trail = 0 while True: try: all_kubelets_ready = True nodes = kubectl.api.list_node() logger.info("%s / %s nodes registered", len(nodes.items), self._total_nodes) if len(nodes.items) < self._total_nodes: all_kubelets_ready = False else: for n in nodes.items: kubelet_check = { "KubeletHasSufficientDisk", "KubeletHasSufficientMemory", "KubeletHasNoDiskPressure", "KubeletReady", "RouteCreated" } for cond in n.status.conditions: if cond.reason in kubelet_check: kubelet_check.remove(cond.reason) if kubelet_check: logger.info( "Node %s not ready yet. Remaining Kubelet checkmarks: %s", n.metadata.name, kubelet_check) all_kubelets_ready = False break else: logger.info("Node %s is ready.", n.metadata.name) if all_kubelets_ready: logger.info("All Kubelets are ready") break except Exception as e: if "Max retries exceeded" in str(e): # If master API server is still not ready at this moment, we don't count as a trail trail -= 1 logger.info("Kubernetes API server not ready yet") else: logger.exception("Caught exception when listing nodes: %s", e) trail += 1 if trail > WAIT_FOR_MINION_REG_RETRY: raise RuntimeError( "Timeout waiting for minions to come up. Please manually check cluster status" ) time.sleep(10) def _recover_auto_scaling_groups(self): """ This steps does the following: - fetch the previously restored auto scaling group config. If this config cannot be found, we can assume that all autoscaling groups have correct configurations. This could happen when previous restart failed in the middle but passed this stage already, or the cluster is not even paused - Wait for all instances to be in service :return: """ # Get previously persisted asg status logger.info("Fetching last cluster status ...") cluster_status_raw = self._cluster_info.download_cluster_status_before_pause( ) asg_mgr = AXUserASGManager(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, region=self._cluster_config.get_region()) if cluster_status_raw: logger.info("Found last cluster status, restoring cluster ...") cluster_status = yaml.load(cluster_status_raw) all_asg_statuses = cluster_status["asg_status"] # Restore minions for asg_name in all_asg_statuses.keys(): asg_status = all_asg_statuses[asg_name] min_size = asg_status["min_size"] max_size = asg_status["max_size"] desired = asg_status["desired_capacity"] self._total_nodes += desired logger.info( "Recovering autoscaling group %s. 
Min: %s, Max: %s, Desired: %s", asg_name, min_size, max_size, desired) asg_mgr.set_asg_spec(name=asg_name, minsize=min_size, maxsize=max_size, desired=desired) logger.info("Waiting for all auto scaling groups to scale up ...") asg_mgr.wait_for_desired_asg_state() logger.info("%sAll cluster instances are in service%s", COLOR_GREEN, COLOR_NORM) # Delete previously stored cluster status self._cluster_info.delete_cluster_status_before_pause() else: all_asgs = asg_mgr.get_all_asgs() for asg in all_asgs: self._total_nodes += asg["DesiredCapacity"] logger.info( "Cannot find last cluster status, cluster already resumed with %s nodes", self._total_nodes) def _ensure_restarter_access(self): if self._cidr not in self._cluster_config.get_trusted_cidr(): logger.info( "Restarting cluster from a not trusted IP (%s). Temporarily allowing access.", self._cidr) self._bootstrap_obj.modify_node_security_groups( old_cidr=[], new_cidr=[self._cidr], action_name="allow-cluster-manager") def _disallow_restarter_access_if_needed(self): if self._cidr not in self._cluster_config.get_trusted_cidr(): logger.info( "Restarting cluster from a not trusted IP (%s). Disallowing access.", self._cidr) self._bootstrap_obj.modify_node_security_groups( old_cidr=[self._cidr], new_cidr=[], action_name="disallow-cluster-manager")
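# --- Illustrative sketch (not part of the original module) ---
# The per-node readiness test used in ClusterResumer._wait_for_minions() above boils down to
# "has this node reported every expected Kubelet condition reason?". A hypothetical pure-function
# version of that check, convenient to unit test without a live Kubernetes API server:

REQUIRED_KUBELET_REASONS = {
    "KubeletHasSufficientDisk",
    "KubeletHasSufficientMemory",
    "KubeletHasNoDiskPressure",
    "KubeletReady",
    "RouteCreated",
}


def missing_kubelet_reasons(condition_reasons):
    """Return the set of expected condition reasons the node has not reported yet."""
    return REQUIRED_KUBELET_REASONS - set(condition_reasons)


if __name__ == "__main__":
    # A node that has reported everything leaves an empty remainder and counts as ready.
    assert not missing_kubelet_reasons(REQUIRED_KUBELET_REASONS)
    # A node still waiting for its route leaves a non-empty remainder.
    assert missing_kubelet_reasons(["KubeletReady"]) == REQUIRED_KUBELET_REASONS - {"KubeletReady"}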
class AXPlatform(object): def __new__(cls, *args, **kwargs): if Cloud().target_cloud_gcp(): from .gke_platform import AXGKEPlatform return super(AXPlatform, cls).__new__(AXGKEPlatform) else: return super(AXPlatform, cls).__new__(cls) def __init__( self, cluster_name_id=None, aws_profile=None, debug=True, manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot, config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile, software_info=None): """ AX Platform bootstrap :param cluster_name_id: cluster name id :param aws_profile: aws profile to authenticate all aws clients :param debug: debug mode :param manifest_root: root directory to all ax service objects """ self._software_info = software_info if software_info else SoftwareInfo( ) assert isinstance( self._software_info, SoftwareInfo ), "Wrong type ({}) of software info passed in.".format( self._software_info) self._aws_profile = aws_profile self._manifest_root = manifest_root self._config = AXPlatformConfig(config_file) logger.info("Using Kubernetes manifest from %s", self._manifest_root) logger.info("Using platform configuration \"%s\" from %s", self._config.name, config_file) self._cluster_name_id = AXClusterId( cluster_name_id).get_cluster_name_id() self._cluster_config = AXClusterConfig( cluster_name_id=self._cluster_name_id, aws_profile=self._aws_profile) self._cluster_config_path = AXClusterConfigPath(cluster_name_id) self._cluster_info = AXClusterInfo(self._cluster_name_id, aws_profile=self._aws_profile) self._region = self._cluster_config.get_region() if Cloud().target_cloud_aws(): self._account = AWSAccountInfo( aws_profile=self._aws_profile).get_account_id() else: self._account = "" self._bucket_name = self._cluster_config_path.bucket() self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=self._aws_profile, region=self._region) # In debug mode, when we failed to create an object, we don't delete it but just # leave it for debug. self._debug = debug # DNS self.cluster_dns_name = None # Get kube cluster config. Automatic if in pod already. 
self._kube_config = self._cluster_info.get_kube_config_file_path( ) if self._cluster_name_id else None if self._cluster_name_id: if not os.path.isfile(self._kube_config): logger.info( "Can't find config file at %s; downloading from s3", self._kube_config) self._kube_config = self._cluster_info.download_kube_config() assert os.path.isfile( self._kube_config), "No kube_config file available" # Kubernetes related objects and macros self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER] self.kube_axsys_namespace = AXNameSpaces.AXSYS self.kube_user_namespace = AXNameSpaces.AXUSER self.kubectl = KubernetesApiClient(config_file=self._kube_config) self.kube_poll = KubeObjPoll(kubectl=self.kubectl) self._monitor = AXKubeMonitor(kubectl=self.kubectl) self._monitor.reload_monitors(namespace=self.kube_axsys_namespace) self._monitor.start() # Kube Objects self._kube_objects = {} self._replacing = {} def _load_kube_objects_from_steps(self, steps): """ Extract kube objects from steps in config, and load them into memory :param steps: list :return: """ for object_group in steps: assert isinstance(object_group, AXPlatformObjectGroup) for obj in object_group.object_set: assert isinstance(obj, AXPlatformObject) name = obj.name filename = obj.manifest namespace = obj.namespace if name in self._kube_objects: raise ValueError("Duplicated object name {}".format(name)) kubeobj_conf_path = os.path.join(self._manifest_root, filename) self._kube_objects[name] = KubeObject( config_file=kubeobj_conf_path, kubepoll=self.kube_poll, replacing=None, kube_config=self._kube_config, kube_namespace=namespace) def _generate_replacing(self): # Platform code are running in python 2.7, and therefore for trusted cidr list, the str() method # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and # this 'u' prefix cannot be surpressed. With this prefix, our macro replacing would create invalid # yaml files, and therefore we construct string manually here trusted_cidr = self._cluster_config.get_trusted_cidr() if isinstance(trusted_cidr, list): trusted_cidr_str = "[" for cidr in trusted_cidr: trusted_cidr_str += "\"{}\",".format(str(cidr)) trusted_cidr_str = trusted_cidr_str[:-1] trusted_cidr_str += "]" else: trusted_cidr_str = "[{}]".format(trusted_cidr) axsys_cpu = 0 axsys_mem = 0 daemon_cpu = 0 daemon_mem = 0 for name in self._kube_objects.keys(): cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage axsys_cpu += cpu axsys_mem += mem daemon_cpu += dcpu daemon_mem += dmem # kube-proxy (100m CPU and 100Mi memory. 
Note kube-proxy does not # have a memory request, but this is an approximation) daemon_cpu += 100 daemon_mem += 100 logger.info( "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi", axsys_cpu, axsys_mem, daemon_cpu, daemon_mem) axsys_node_count = int(self._cluster_config.get_asxys_node_count()) axuser_min_count = str( int(self._cluster_config.get_min_node_count()) - axsys_node_count) axuser_max_count = str( int(self._cluster_config.get_max_node_count()) - axsys_node_count) autoscaler_scan_interval = str( self._cluster_config.get_autoscaler_scan_interval()) usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[ self._cluster_config.get_axuser_node_type()]["cpu"] usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[ self._cluster_config.get_axuser_node_type()]["memory"] scale_down_util_thresh = round( max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001 logger.info("Setting node scale down utilization threshold to %s", scale_down_util_thresh) self._persist_node_resource_rsvp(daemon_cpu, daemon_mem) with open("/kubernetes/cluster/version.txt", "r") as f: cluster_install_version = f.read().strip() # Prepare autoscaler asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile) asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg( ) or asg_manager.get_on_demand_asg() if not asg: raise AXPlatformException( "Failed to get autoscaling group for cluster {}".format( self._cluster_name_id)) asg_name = asg["AutoScalingGroupName"] if not asg_name: logger.error("Autoscaling group name not found for %s", self._cluster_name_id) raise AXPlatformException("Cannot find cluster autoscaling group") # Prepare minion-manager. spot_instances_option = self._cluster_config.get_spot_instances_option( ) minion_manager_asgs = "" if spot_instances_option == SpotInstanceOption.ALL_SPOT: for asg in asg_manager.get_all_asgs(): minion_manager_asgs = minion_manager_asgs + asg[ "AutoScalingGroupName"] + " " minion_manager_asgs = minion_manager_asgs[:-1] elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT: minion_manager_asgs = asg_manager.get_variable_asg( )["AutoScalingGroupName"] return { "REGISTRY": self._software_info.registry, "REGISTRY_SECRETS": self._software_info.registry_secrets, "NAMESPACE": self._software_info.image_namespace, "VERSION": self._software_info.image_version, "AX_CLUSTER_NAME_ID": self._cluster_name_id, "AX_AWS_REGION": self._region, "AX_AWS_ACCOUNT": self._account, "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(), "TRUSTED_CIDR": trusted_cidr_str, "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ", "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ", "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"), "AX_CLUSTER_INSTALL_VERSION": cluster_install_version, "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()), "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(), "ASG_MIN": axuser_min_count, "ASG_MAX": axuser_max_count, "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval, "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh), "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key( key=self._cluster_config_path.cluster_metadata()), "ASG_NAME": asg_name, "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]), "AX_ENABLE_SPOT_INSTANCES": str(spot_instances_option != SpotInstanceOption.NO_SPOT), "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs, } def _persist_node_resource_rsvp(self, user_node_daemon_cpu, 
user_node_daemon_mem): self._cluster_config.set_user_node_resource_rsvp( cpu=user_node_daemon_cpu, mem=user_node_daemon_mem) self._cluster_config.save_config() def start(self): """ Bring up platform using "platform-start.cfg" configuration from manifest directory :return: """ # Generate kube-objects steps = self._config.steps self._load_kube_objects_from_steps(steps) self._replacing = self._generate_replacing() # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32) # At this moment, we MUST separate first step due to the above dependency assert len(steps) >= 2, "Should have at least 1 step to create axops" self.create_objects(steps[0]) self.create_objects(steps[1]) self.create_objects(steps[2]) # Prepare axops_eip self._set_ext_dns() logger.debug("Replacing ENVs: %s", self._replacing) info_bound = "=======================================================\n" img_namespace = "Image Namespace: {}\n".format( self._software_info.image_namespace) img_version = "Image Version: {}\n".format( self._software_info.image_version) start_info = "\n\n{}{}{}{}{}".format( info_bound, "Platform Up: Bringing up Argo services...\n", img_namespace, img_version, info_bound) logger.info(start_info) # Start rest of the objects for i in range(3, len(steps)): self.create_objects(steps[i]) # update application namespace logger.info("Updating application managers") for app in Applications(client=self.kubectl).list(): logger.info("--- updating {}".format(app)) a = Application(app, client=self.kubectl) a.create(force_recreate=True) logger.info("Done updating application managers") # Upload version information to target cluster self._update_version() logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n", COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name, COLOR_NORM) def stop(self): """ Bring down platform using "platform-stop.cfg" configuration from manifest directory :return: """ # Generate kube-objects (Does not need to generate replacing during platform down) # Stop order should be the reverse of start steps = self._config.steps steps.reverse() self._load_kube_objects_from_steps(steps) info_bound = "=======================================================\n" stop_info = "\n\n{}{}{}".format( info_bound, "Platform Down: Shutting down Argo services...\n", info_bound) logger.info(stop_info) # Bring down objects according to steps for i in range(len(steps)): object_group = steps[i] self.delete_objects(object_group) def stop_monitor(self): self._monitor.stop() def create_objects(self, objects): """ Start kubernetes objects based on records. Wait for all of them. 
        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(policy=objects.policy,
                                         policy_predicate=objects.policy_predicate,
                                         consistency=objects.consistency):
            logger.debug("Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                         objects.name, objects.policy, objects.policy_predicate, objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        logger.info("Creating platform objects\n\n%s", self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(self.start_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_create_group(self, policy, policy_predicate, consistency):
        """
        Take AXPlatformObjectGroup policy, predicate and consistency and determine
        if this group should be created or not
        :param policy:
        :param policy_predicate:
        :param consistency:
        :return:
        """
        # Since we are not using consistency yet, we always create a group unless explicitly told
        # not to, i.e. when its predicate is PrivateRegistryOnly and the registry is not private.
        # The interface is kept so that the create decision can later be driven by policy,
        # policy_predicate and consistency.
        if policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly and \
                not self._software_info.registry_is_private():
            return False
        return True

    def delete_objects(self, objects):
        """
        Stop kubernetes objects based on records. Wait for all of them.
        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_delete_group(policy=objects.policy,
                                         policy_predicate=objects.policy_predicate):
            logger.debug("Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                         objects.name, objects.policy, objects.policy_predicate)
            return
        logger.info("Delete step: %s", objects.name)
        logger.info("Deleting platform objects\n\n%s.", self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(self.stop_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Delete")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to delete platform objects.")

    def _should_delete_group(self, policy, policy_predicate):
        """
        Take AXPlatformObjectGroup policy and determine if this group should be deleted or not.
Consistency is not needed for deletion :param policy: :param policy_predicate: :return: """ if policy == ObjectGroupPolicy.CreateMany: return True return False def start_one(self, name, namespace=AXNameSpaces.AXSYS): time.sleep( random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter)) logger.info("Creating %s in namespace %s ...", name, namespace) start = time.time() kube_obj = self._kube_objects[name] # Update them as there are new updates in replacing in platform start kube_obj.namespace = namespace kube_obj.replacing = self._replacing assert isinstance(kube_obj, KubeObject) result = { "name": name, "code": [], "events": [], "failed": False, "duration": "" } if kube_obj.healthy(): result["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS) ] result["duration"] = str(round(time.time() - start, 2)) return result # Previous platform start might fail, and might result in some componenets created # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing # object and try to create a new one if kube_obj.exists(): logger.warning( "Object %s exists but not healthy. Deleting object for idempotency ...", name) self.stop_one(name, namespace) assert not kube_obj.exists( ), "Kubeobject {} already created but is not healthy. Not Expected".format( name) monitor_info = kube_obj.get_create_monitor_info() if monitor_info: # use monitor waiters = [] # Create and register waiters for all objects that can be monitored for m in monitor_info: wait_info = { "kind": KubeKindToKubeApiObjKind[m.kube_kind], "name": m.name, "validator": m.validator } waiter = KubeObjWaiter() waiters.append((waiter, wait_info)) AXKubeMonitor().wait_for_kube_object( wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout, waiter) # Call kubectl create kube_obj.create() # Wait on all waiters to retrieve status and events for waiter, wait_info in waiters: waiter.wait() result["events"] += waiter.details result["code"].append("{:.25s}:{}".format( wait_info["name"], waiter.result)) if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN: logger.info("Successfully created %s with code %s.", wait_info["name"], waiter.result) else: result["failed"] = True logger.error( "Failed to create %s in %s with code %s. Events: %s", wait_info["name"], namespace, waiter.result, str(waiter.details)) if not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) result["code"] += del_rst["code"] result["events"] += del_rst["events"] result["duration"] = str(round(time.time() - start, 2)) return result # Poll extra if required (for Petset and Deployments with multiple replicas) if kube_obj.extra_poll: logger.info( "Polling till healthy to make sure rest of components of %s are up and running ...", name) create_rst = self._poll_till_healthy( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults. ObjCreateExtraPollInterval, poll_max_retry=AXPlatformConfigDefaults. ObjCreateExtraPollMaxRetry, rst=result) if create_rst["failed"] and not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) create_rst["code"] += del_rst["code"] create_rst["events"] += del_rst["events"] create_rst["duration"] = str(round(time.time() - start, 2)) return create_rst # Poll once to confirm all components from this Kubernetes config file exist, # In case there are objects in this config file cannot be monitored, i.e. svc # without elb. 
This is really not expected so we don't delete it if not kube_obj.healthy(): logger.error( "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.", name) result["code"].append("{:.25s}:{}".format( name, KubeObjStatusCode.UNHEALTHY)) result["failed"] = True result["events"].append( "Object {} created byt is not healthy".format(name)) result["duration"] = str(round(time.time() - start, 2)) if not result["failed"]: logger.info("Successfully created object %s.", name) return result else: # use polling kube_obj.create() create_rst = self._poll_till_healthy( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval, poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry, rst=result) if create_rst["failed"] and not self._debug: logger.info("Deleting %s due to creation failure", name) del_rst = self.stop_one(name, namespace) create_rst["code"] += del_rst["code"] create_rst["events"] += del_rst["events"] create_rst["duration"] = str(round(time.time() - start, 2)) return create_rst @staticmethod def _poll_till_healthy(name, kube_obj, start_time, poll_interval, poll_max_retry, rst): trail = 0 assert isinstance(kube_obj, KubeObject) while True: if not kube_obj.healthy(): trail += 1 if trail > poll_max_retry: logger.error("Failed to create KubeObject %s", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY) ] rst["events"] += [ "Object {} creation timeout. Not healthy".format(name) ] rst["failed"] = True rst["duration"] = str(round(time.time() - start_time, 2)) return rst else: logger.info("Successfully created %s.", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.OK) ] rst["failed"] = False rst["duration"] = str(round(time.time() - start_time, 2)) return rst time.sleep(poll_interval) def stop_one(self, name, namespace=AXNameSpaces.AXSYS): time.sleep( random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter)) logger.info("Deleting %s in namespace %s ...", name, namespace) start = time.time() kube_obj = self._kube_objects[name] kube_obj.namespace = namespace kube_obj.replacing = self._replacing assert isinstance(kube_obj, KubeObject) result = { "name": name, "code": [], "events": [], "failed": False, "duration": "" } # Don't delete if object does not exist if not kube_obj.exists(): result["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED) ] result["duration"] = str(round(time.time() - start, 2)) return result monitor_info = kube_obj.get_delete_monitor_info() if monitor_info: # use monitor waiters = [] # Create and register waiters for all objects that can be monitored for m in monitor_info: wait_info = { "kind": KubeKindToKubeApiObjKind[m.kube_kind], "name": m.name, "validator": m.validator } waiter = KubeObjWaiter() waiters.append((waiter, wait_info)) AXKubeMonitor().wait_for_kube_object( wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout, waiter) # Call kubectl delete kube_obj.delete() # Wait on all waiters to retrieve status and events for waiter, wait_info in waiters: waiter.wait() result["events"] += waiter.details if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN: result["code"].append("{:.25s}:{}".format( wait_info["name"], KubeObjStatusCode.DELETED)) logger.info("Successfully deleted %s in %s with code %s.", wait_info["name"], name, result["code"]) else: result["failed"] = True result["code"].append("{:.25s}:{}".format( wait_info["name"], KubeObjStatusCode.UNKNOWN)) logger.error( "Failed to delete %s in 
%s with code %s. Events: %s", wait_info["name"], name, result["code"], str(waiter.details)) # Poll once to confirm all components from this Kubenetes config file exist # In case there are objects in this config file cannot be monitored, i.e. svc without elb if kube_obj.exists(): logger.error("Object %s deleted but still exists", name) result["failed"] = True result["code"].append("{:.25s}:{}".format( name, KubeObjStatusCode.UNKNOWN)) result["events"].append( "Object {} deleted but still exists.".format(name)) result["duration"] = str(round(time.time() - start, 2)) logger.info("Successfully deleted %s.", name) return result else: # use polling kube_obj.delete() return self._poll_till_not_exists( name=name, kube_obj=kube_obj, start_time=start, poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval, poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry, rst=result) @staticmethod def _poll_till_not_exists(name, kube_obj, start_time, poll_interval, poll_max_retry, rst): trail = 0 assert isinstance(kube_obj, KubeObject) while True: if kube_obj.exists(): trail += 1 if trail > poll_max_retry: logger.error("Failed to delete KubeObject %s", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.UNKNOWN) ] rst["events"] += [ "Object {} deletion timeout. Please manually check remaining pods" .format(name) ] rst["failed"] = True rst["duration"] = str(round(time.time() - start_time, 2)) return rst else: logger.info("Successfully deleted %s.", name) rst["code"] += [ "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED) ] rst["failed"] = False rst["duration"] = str(round(time.time() - start_time, 2)) return rst time.sleep(poll_interval) def _generate_object_summary(self, objects): """ :param objects: list of AXPlatformObject :return: """ report_title = "\n{:25s} | {:110s} | {:20s}\n".format( "NAME", "MANIFEST", "NAMESPACE") report_bar = "{}\n".format("-" * 174) content = "" for obj in objects: assert isinstance(obj, AXPlatformObject) name = obj.name filename = os.path.join(self._manifest_root, obj.manifest) namespace = obj.namespace content += "{:25s} | {:110s} | {:20s}\n".format( name, filename, namespace) return report_title + report_bar + content @staticmethod def _generate_report(results, operation): failed = False report_body = "" warnings = "\n======= WARNING EVENTS =======\n" for name in results.keys(): individual_report = "{:25s} | {:110s} | {:20s}\n" individual_warning = "{name}: {events}\n\n" try: result = results[name].get() if result["failed"]: failed = True code = result["code"][0] for c in result["code"][1:]: code += " / {}".format(c) individual_report = individual_report.format( name, code, result["duration"], 2) if len(result["events"]) > 0: warnings += individual_warning.format( name=name, events=str(result["events"])) except Exception as e: failed = True logger.exception(str(e)) individual_report = individual_report.format( name, "EXCEPTION", "UNKNOWN") warnings += individual_warning.format(name=name, events=str(e)) report_body += individual_report report_head = "\n\nPlatform {} {}. 
Report:\n".format( operation, "FAILED" if failed else "SUCCESSFULLY") report_title = "\n{:25s} | {:110s} | {:20s}\n".format( "NAME", "STATUS", "TIME (sec)") report_bar = "{}\n".format("-" * 174) return "{}{}{}{}{}{}".format( report_head, report_title, report_bar, report_body, warnings, "==============================\n"), failed def _get_eip_from_config_map(self): try: cmd = [ "kubectl", "get", "configmap", "cluster-dns-name", "-o", "yaml", "--namespace", self.kube_axsys_namespace, "--kubeconfig", self._kube_config ] out = subprocess.check_output(cmd) return [yaml.load(out)["data"]["cluster-external-dns-name"]] except Exception: logger.error("Failed to get cluster dns name from config map.") return None def _get_svc_eip(self, svclabel, namespace): svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace, svclabel) assert len( svc.items) == 1, "Currently services should only have one ingress" rst = [] for ig in svc.items[0].status.load_balancer.ingress: if ig.hostname: rst.append(ig.hostname) if ig.ip: rst.append(ig.ip) return rst def _set_ext_dns(self): axops_eip = self._get_eip_from_config_map() or self._get_svc_eip( svclabel="app=axops", namespace=AXNameSpaces.AXSYS) if not axops_eip: logger.error( "Platform Start Failed: cannot find External IP for AXOPS") raise AXPlatformException("AXOPS elastic IP does not exist") self.cluster_dns_name = axops_eip[0] # Don't change format of this message. Portal parses this line to get cluster IP/DNS. logger.info( "\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n", COLOR_GREEN, self.cluster_dns_name, COLOR_NORM) self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name def get_cluster_external_dns(self): if not self.cluster_dns_name: self._set_ext_dns() return self.cluster_dns_name def _set_autoscaling(self): # Prepare autoscaler asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile) asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg( ) or asg_manager.get_on_demand_asg() if not asg: raise AXPlatformException( "Failed to get autoscaling group for cluster {}".format( self._cluster_name_id)) asg_name = asg["AutoScalingGroupName"] if asg_name is not None: self._replacing["ASG_NAME"] = asg_name else: logger.error("Autoscaling group name not found for %s", self._cluster_name_id) raise AXPlatformException("Cannot find cluster autoscaling group") # TODO (#157) Version should only be uploaded during install and upgrade time def _update_version(self): # Software info we get during install / upgrade does not contain ami id # need to persist it as well self._software_info.ami_id = self._cluster_config.get_ami_id() AXVersion(AXCustomerId().get_customer_id(), self._cluster_name_id, self._aws_profile).update(self._software_info.to_dict())
class ClusterPauser(ClusterOperationBase): def __init__(self, cfg): assert isinstance(cfg, ClusterPauseConfig) self._cfg = cfg super(ClusterPauser, self).__init__(cluster_name=self._cfg.cluster_name, cluster_id=self._cfg.cluster_id, cloud_profile=self._cfg.cloud_profile, dry_run=self._cfg.dry_run) # This will raise exception if name/id mapping cannot be found self._name_id = self._idobj.get_cluster_name_id() self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) self._cluster_config = AXClusterConfig( cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) self._master_manager = AXMasterManager( cluster_name_id=self._name_id, region=self._cluster_config.get_region(), profile=self._cfg.cloud_profile) self._bootstrap_obj = AXBootstrap( cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, region=self._cluster_config.get_region()) self._cidr = str(get_public_ip()) + "/32" def pre_run(self): if self._cluster_info.is_cluster_supported_by_portal(): raise RuntimeError( "Cluster is currently supported by portal. Please login to portal to perform cluster management operations." ) if self._csm.is_paused(): logger.info("Cluster is already paused.") sys.exit(0) # This is for backward compatibility if not check_cluster_staging(cluster_info_obj=self._cluster_info, stage="stage2"): raise RuntimeError( "Cluster is not successfully installed: Stage2 information missing! Operation aborted." ) self._csm.do_pause() self._persist_cluster_state_if_needed() def run(self): if self._cfg.dry_run: logger.info("DRY RUN: pausing cluster %s", self._name_id) return # Check if cluster's master is paused already. Since terminating master is the very last thing # of pausing cluster, if master is already stopped, cluster has already been successfully paused stopped_master = self._master_manager.discover_master( state=[EC2InstanceState.Stopped]) if stopped_master: logger.info( "\n\n%sMaster %s already stopped. Cluster %s already paused%s\n", COLOR_GREEN, stopped_master, self._name_id, COLOR_NORM) return else: logger.info("\n\n%sPausing cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) # Main pause cluster routine try: self._ensure_pauser_access() ensure_manifest_temp_dir() self._shutdown_platform() self._scale_down_auto_scaling_groups() self._wait_for_deregistering_minions() logger.info("Stopping master ...") self._master_manager.stop_master() logger.info("\n\n%sSuccessfully paused cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) except Exception as e: logger.exception(e) raise RuntimeError(e) finally: self._disallow_pauser_access_if_needed() def post_run(self): self._csm.done_pause() self._persist_cluster_state_if_needed() def _wait_for_deregistering_minions(self): """ This step waits for all minions to be de-registered from Kubernetes master, e.g. 
`kubectl get nodes` returns no minions besides master :return: """ # Wait for kubernetes master de-register all minions logger.info( "Waiting for Kubernetes master to de-register all existing minions" ) self._cluster_info.download_kube_config() kube_config = self._cluster_info.get_kube_config_file_path() kubectl = KubernetesApiClient(config_file=kube_config) while True: try: nodes = kubectl.api.list_node() node_names = [] # list nodes should only show master now if len(nodes.items) > 1: for n in nodes.items: node_names.append(n.metadata.name) logger.info("Remaining Kubernetes minions: %s", node_names) else: # I don't see it necessary to check if the remaining node is master or not logger.info("%sAll minions de-registered from master%s", COLOR_GREEN, COLOR_NORM) break except Exception as e: logger.warning("Caught exception when listing nodes: %s", e) time.sleep(15) def _scale_down_auto_scaling_groups(self): """ This step: - Persist autoscaling group states to S3, - Scale down all autoscaling groups to zero, - Wait for all minion to be terminated :return: """ logger.info("Discovering autoscaling groups") asg_mgr = AXUserASGManager(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, region=self._cluster_config.get_region()) all_asgs = asg_mgr.get_all_asgs() # Generate cluster status before pause. This is used to recover same amount of nodes # when we want to restart cluster cluster_status = {"asg_status": {}} for asg in all_asgs: cluster_status["asg_status"][asg["AutoScalingGroupName"]] = { "min_size": asg["MinSize"], "max_size": asg["MaxSize"], "desired_capacity": asg["DesiredCapacity"] } self._cluster_info.upload_cluster_status_before_pause( status=yaml.dump(cluster_status)) # Scale down asg logger.info("Scaling down autoscaling groups ...") for asg in all_asgs: asg_name = asg["AutoScalingGroupName"] asg_mgr.set_asg_spec(name=asg_name, minsize=0, maxsize=0) # Waiting for nodes to be terminated logger.info("Waiting for all auto scaling groups to scale down ...") asg_mgr.wait_for_desired_asg_state() logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN, COLOR_NORM) def _shutdown_platform(self): """ This step shuts down platform based on the config and manifest provided :return: """ logger.info("Shutting platform for pausing the cluster ...") self._cluster_info.download_platform_manifests_and_config( target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT, target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH) platform = AXPlatform(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, manifest_root=TEMP_PLATFORM_MANIFEST_ROOT, config_file=TEMP_PLATFORM_CONFIG_PATH) platform.stop() platform.stop_monitor() def _ensure_pauser_access(self): if self._cidr not in self._cluster_config.get_trusted_cidr(): logger.info( "Pausing cluster from a not trusted IP (%s). Temporarily allowing access.", self._cidr) self._bootstrap_obj.modify_node_security_groups( old_cidr=[], new_cidr=[self._cidr], action_name="allow-cluster-manager") def _disallow_pauser_access_if_needed(self): if self._cidr not in self._cluster_config.get_trusted_cidr(): logger.info( "Pausing cluster from a not trusted IP (%s). Disallowing access.", self._cidr) self._bootstrap_obj.modify_node_security_groups( old_cidr=[self._cidr], new_cidr=[], action_name="disallow-cluster-manager")
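# --- Illustrative sketch (not part of the original module) ---
# The pause/resume pair above round-trips auto scaling group sizes through a YAML document
# ("asg_status": name -> min/max/desired). A minimal standalone version of that round trip,
# using boto3 directly instead of AXUserASGManager; the profile, region and ASG names in the
# example wiring are assumptions.

import boto3
import yaml


def snapshot_asgs(asg_client, asg_names):
    """Capture min/max/desired for the given ASGs, in the same shape the pauser persists."""
    status = {"asg_status": {}}
    resp = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=asg_names)
    for asg in resp["AutoScalingGroups"]:
        status["asg_status"][asg["AutoScalingGroupName"]] = {
            "min_size": asg["MinSize"],
            "max_size": asg["MaxSize"],
            "desired_capacity": asg["DesiredCapacity"],
        }
    return yaml.safe_dump(status)


def restore_asgs(asg_client, status_yaml):
    """Re-apply the persisted sizes, as the resumer does after restarting the master."""
    status = yaml.safe_load(status_yaml)
    for name, spec in status["asg_status"].items():
        asg_client.update_auto_scaling_group(
            AutoScalingGroupName=name,
            MinSize=spec["min_size"],
            MaxSize=spec["max_size"],
            DesiredCapacity=spec["desired_capacity"],
        )

# Example wiring (hypothetical profile/region/ASG name):
#   session = boto3.session.Session(profile_name="dev", region_name="us-west-2")
#   client = session.client("autoscaling")
#   saved = snapshot_asgs(client, ["my-cluster-user-variable"])
#   ... scale down to zero, pause, later resume ...
#   restore_asgs(client, saved)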
def update_cluster_config(self): """ Upgrade the cluster config in S3 such that it has all required fields. """ logger.info("Updating cluster config!") cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id, aws_profile=self._profile) cluster_info = AXClusterInfo(cluster_name_id=self._cluster_name_id, aws_profile=self._profile) # Separate axsys / axuser config if needed update_node_config_key_needed = False try: # New cluster config is looking for "max_node_count" for this method and # should throw KeyError if the cluster config in s3 was the old one cluster_config.get_max_node_count() except KeyError: update_node_config_key_needed = True if update_node_config_key_needed: logger.info("Updating node config keys ...") # Parse old raw config directly minion_type = cluster_config._conf["cloud"]["configure"][ "minion_type"] max_count = cluster_config._conf["cloud"]["configure"]["max_count"] min_count = cluster_config._conf["cloud"]["configure"]["min_count"] axsys_count = cluster_config._conf["cloud"]["configure"][ "axsys_nodes"] # Remove all old keys for old_key in [ "minion_type", "max_count", "min_count", "axsys_nodes" ]: cluster_config._conf["cloud"]["configure"].pop(old_key, None) # Setting new keys cluster_config._conf["cloud"]["configure"][ "axsys_node_count"] = axsys_count cluster_config._conf["cloud"]["configure"][ "max_node_count"] = max_count cluster_config._conf["cloud"]["configure"][ "min_node_count"] = min_count # All clusters that needs this upgrade has same node type for axsys and axuser cluster_config._conf["cloud"]["configure"][ "axuser_node_type"] = minion_type cluster_config._conf["cloud"]["configure"][ "axsys_node_type"] = minion_type else: logger.info("Node config keys are already up-to-date") # If cluster type is not set, default it to standard type if cluster_config.get_ax_cluster_type() == None: cluster_config._conf["cloud"]["configure"][ "cluster_type"] = AXClusterType.STANDARD # Check and update Cluster user. Defaults to "customer" if cluster_config.get_ax_cluster_user() is None: cluster_config.set_ax_cluster_user('customer') # Check and update Cluster size. Defaults to "small" if cluster_config.get_ax_cluster_size() is None: max_count = cluster_config.get_max_node_count() if max_count == 5: cluster_size = "small" elif max_count == 10: cluster_size = "medium" elif max_count == 21: cluster_size = "large" elif max_count == 30: cluster_size = "xlarge" else: cluster_size = "small" cluster_config.set_ax_cluster_size(cluster_size) # Check and update AX Volume size. Note that this has to come *AFTER* the cluster_size is set. if cluster_config.get_ax_vol_size() is None: cluster_size = cluster_config.get_ax_cluster_size() if cluster_size in ("small", "medium"): vol_size = 100 elif cluster_size == "large": vol_size = 200 elif cluster_size == "xlarge": vol_size = 400 else: vol_size = 100 cluster_config.set_ax_vol_size(vol_size) # Ensure that we have 3 tiers now cluster_config.set_node_tiers("master/applatix/user") # set new ami id ami_name = os.getenv("AX_AWS_IMAGE_NAME") ami_id = AMI( aws_region=self._region, aws_profile=self._profile).get_ami_id_from_name(ami_name=ami_name) logger.info("Updating cluster config with ami %s", ami_id) cluster_config.set_ami_id(ami_id) cluster_config.save_config()
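# --- Illustrative sketch (not part of the original module) ---
# The key migration performed by update_cluster_config() above (old minion_type / max_count /
# min_count / axsys_nodes keys -> new *_node_count / *_node_type keys) can be expressed as a pure
# transformation on the raw "cloud.configure" dict, which is easier to test than mutating
# AXClusterConfig internals. The function name and sample values are made up.

def migrate_node_config(configure):
    """Return a copy of the cloud.configure dict using the new node-count/node-type keys."""
    new = dict(configure)
    if "max_node_count" in new:
        return new  # already migrated
    minion_type = new.pop("minion_type")
    max_count = new.pop("max_count")
    min_count = new.pop("min_count")
    axsys_nodes = new.pop("axsys_nodes")
    new.update({
        "axsys_node_count": axsys_nodes,
        "max_node_count": max_count,
        "min_node_count": min_count,
        # Clusters that need this migration used the same instance type for axsys and axuser nodes
        "axuser_node_type": minion_type,
        "axsys_node_type": minion_type,
    })
    return new


if __name__ == "__main__":
    old = {"minion_type": "m3.large", "max_count": 10, "min_count": 3, "axsys_nodes": 2}
    new = migrate_node_config(old)
    assert new["max_node_count"] == 10 and new["axsys_node_type"] == "m3.large"
    assert migrate_node_config(new) == new  # idempotent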
class ClusterUninstaller(ClusterOperationBase): def __init__(self, cfg): assert isinstance(cfg, ClusterUninstallConfig) self._cfg = cfg super(ClusterUninstaller, self).__init__(cluster_name=self._cfg.cluster_name, cluster_id=self._cfg.cluster_id, cloud_profile=self._cfg.cloud_profile, dry_run=self._cfg.dry_run) # This will raise exception if name/id mapping cannot be found self._name_id = self._idobj.get_cluster_name_id() self._cluster_info = AXClusterInfo(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) self._cluster_config = AXClusterConfig( cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile) # Initialize node count to 1 as master is not in an auto scaling group self._total_nodes = 1 self._cidr = str(get_public_ip()) + "/32" def pre_run(self): if self._cluster_info.is_cluster_supported_by_portal(): raise RuntimeError( "Cluster is currently supported by portal. Please login to portal to perform cluster management operations." ) # Abort operation if cluster is not successfully installed if not check_cluster_staging( cluster_info_obj=self._cluster_info, stage="stage2") and not self._cfg.force_uninstall: raise RuntimeError( "Cluster is not successfully installed or has already been half deleted. If you really want to uninstall the cluster, please add '--force-uninstall' flag to finish uninstalling cluster. e.g. 'argocluster uninstall --force-uninstall --cluster-name xxx'" ) if not self._csm.is_running() and not self._cfg.force_uninstall: raise RuntimeError( "Cluster is not in Running state. If you really want to uninstall the cluster, please add '--force-uninstall' flag to finish uninstalling cluster. e.g. 'argocluster uninstall --force-uninstall --cluster-name xxx'" ) self._csm.do_uninstall() self._ensure_critical_information() self._persist_cluster_state_if_needed() def post_run(self): return def run(self): if self._cfg.dry_run: logger.info("DRY RUN: Uninstalling cluster %s", self._name_id) return logger.info("%s\n\nUninstalling cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) # Main uninstall cluster routine try: self._check_cluster_before_uninstall() # We only need to keep stage0 information, which is an indication of we still need to # clean up the Kubernetes cluster self._cluster_info.delete_staging_info("stage2") self._cluster_info.delete_staging_info("stage1") self._clean_up_kubernetes_cluster() # As _clean_up_argo_specific_cloud_infrastructure() will clean everything inside bucket # that is related to this cluster, stage0 information is not explicitly deleted here self._clean_up_argo_specific_cloud_infrastructure() logger.info("\n\n%sSuccessfully uninstalled cluster %s%s\n", COLOR_GREEN, self._name_id, COLOR_NORM) except Exception as e: logger.exception(e) raise RuntimeError(e) def _ensure_critical_information(self): """ If not force uninstall, we don't require user to provide a cloud regions / placement and therefore these 2 fields in self._cfg are None. We need to load them from cluster config :return: """ load_from_cluster_config = True if self._cfg.force_uninstall: if self._cfg.cloud_region and self._cfg.cloud_placement: load_from_cluster_config = False elif not check_cluster_staging(cluster_info_obj=self._cluster_info, stage="stage0"): # Fail uninstall when cluster_config does not exist and region/placement # information are not provided raise RuntimeError(""" Cluster Stage 0 information is missing. Cluster is either not installed or it's management records in S3 are broken. 
If you believe there is still resource leftover, please provide cluster's region/placement information using "--cloud-placement" and "--cloud-region" """) if load_from_cluster_config: self._cfg.cloud_region = self._cluster_config.get_region() self._cfg.cloud_placement = self._cluster_config.get_zone() def _clean_up_argo_specific_cloud_infrastructure(self): """ This step cleans up components in cloud provider that are specifically needed by Argo cluster, including: - Buckets (everything under this cluster's directory) - Server certificates :return: """ logger.info( "Cluster uninstall step: Clean Up Argo-specific Infrastructure") AXClusterBuckets(self._name_id, self._cfg.cloud_profile, self._cfg.cloud_region).delete() # Delete server certificates: This code is deleting the default server certificates created # by public and private elb. Since server certs cannot be tagged, we need to delete them this way. certname = ManagedElb.get_elb_name(self._name_id, "ing-pub") delete_server_certificate(self._cfg.cloud_profile, certname) certname = ManagedElb.get_elb_name(self._name_id, "ing-pri") delete_server_certificate(self._cfg.cloud_profile, certname) def _clean_up_kubernetes_cluster(self): """ This step cleans up Kubernetes if needed. It only touches components in cloud provider that Kubernetes needs, including: - Load Balancers - Instances - Auto scaling groups - launch configurations - Volumes - Security groups - Elastic IPs - VPCs (If this VPC is not shared) :return: """ if not check_cluster_staging( cluster_info_obj=self._cluster_info, stage="stage0") and not self._cfg.force_uninstall: logger.info("Skip clean up Kubernetes cluster") return logger.info("Cluster uninstall step: Clean Up Kubernetes Cluster") if self._cfg.force_uninstall: msg = "{}\n\nIt is possible that cluster S3 bucket is accidentally deleted,\n".format( COLOR_YELLOW) msg += "or S3 bucket information has been altered unintentionally. In this\n" msg += "case, we still try to delete cluster since this is force uninstall.\n" msg += "NOTE: cluster deletion might NOT be successful and still requires\n" msg += "user to clean up left-over resources manually.{}\n".format( COLOR_NORM) logger.warning(msg) env = { "KUBERNETES_PROVIDER": self._cfg.cloud_provider, "KUBE_AWS_ZONE": self._cfg.cloud_placement, "KUBE_AWS_INSTANCE_PREFIX": self._name_id } if self._cfg.cloud_profile: env["AWS_DEFAULT_PROFILE"] = self._cfg.cloud_profile logger.info("\n\n%sCalling kube-down ...%s\n", COLOR_GREEN, COLOR_NORM) AXKubeUpDown(cluster_name_id=self._name_id, env=env, aws_profile=self._cfg.cloud_profile).down() # TODO (#111): revise volume teardown in GCP if Cloud().target_cloud_aws(): delete_tagged_ebs(aws_profile=self._cfg.cloud_profile, tag_key=COMMON_CLOUD_RESOURCE_TAG_KEY, tag_value=self._name_id, region=self._cfg.cloud_region) def _check_cluster_before_uninstall(self): """ This step does sanity check before uninstalling the cluster. :return: """ if not self._cfg.force_uninstall: logger.info("Cluster uninstall step: Sanity Checking") self._cluster_info.download_kube_config() self._ensure_uninstaller_access() self._check_cluster_fixture(kube_config_path=self._cluster_info. get_kube_config_file_path()) else: msg = "{}\n\nForce uninstall: Skip checking cluster. Note that uninstall might fail if there is\n".format( COLOR_YELLOW) msg += "still managed fixture hooked up with cluster. 
In case cluster uninstall failed due to AWS\n" msg += "resource dependency, please manually clean up those resources and retry uninstall.\n{}".format( COLOR_NORM) logger.warning(msg) @staticmethod def _check_cluster_fixture(kube_config_path): """ This step checks if the cluster has any fixture hooked up. - If there are fixtures hooked up, we abort uninstall, as we don't know how to tear down managed fixtures when we clean up cloud resources - If we don't know whether there is fixture or not, we print out a warning for now and continue :param kube_config_path: path to kube_config :return: """ with open(kube_config_path, "r") as f: config_data = f.read() kube_config = yaml.load(config_data) username = None password = None # All kubeconfig we generate has only 1 cluster server = kube_config["clusters"][0]["cluster"]["server"] for user in kube_config.get("users", []): u = user["user"] if u.get("username", ""): username = u.get("username") password = u.get("password") break if not (username and password): logger.warning( "%sFailed to check managed fixture because Kubernetes credentials cannot be found to access cluster%s", COLOR_YELLOW, COLOR_NORM) return cmd = [ "curl", "--insecure", "--silent", "-u", "{}:{}".format(username, password), "--max-time", "15", "{server}/api/v1/proxy/namespaces/axsys/services/fixturemanager/v1/fixture/instances?deleted=false" .format(server=server) ] try: ret = subprocess.check_output(cmd) except subprocess.CalledProcessError as cpe: msg = "{}\n\nFailed to check cluster fixture state due to {}. Cluster might\n".format( COLOR_YELLOW, cpe) msg += "not be healthy. We will proceed to uninstall cluster with best effort. Note if there are\n" msg += "fixtures that are not cleaned up, uninstall can fail. You can manually\n" msg += "clean them up and uninstall again.\n{}".format(COLOR_NORM) logger.warning(msg) return if ret: try: fixture = json.loads(ret).get("data", []) if fixture: logger.error("Remaining fixtures:\n%s", fixture) raise RuntimeError( "Please cleanup all fixtures before doing uninstall. Or use '--force-uninstall' option to skip this check" ) else: logger.info( "Cluster has no fixture hooked up, proceed to uninstall." ) except ValueError as ve: # In case cluster is not healthy, command output will not be able to loaded # as json. Currently treat it same as "Cannot get fixture data" case logger.warning( "Cannot parse fixture info: %s. Assume cluster has no fixture, proceed to uninstall. Fixture info: %s", ve, ret) else: logger.warning( "Cannot get fixture data. Assume that cluster has no fixture hooked up, proceed to uninstall." ) def _ensure_uninstaller_access(self): if self._cidr not in self._cluster_config.get_trusted_cidr(): logger.info( "Pausing cluster from a not trusted IP (%s). Temporarily allowing access.", self._cidr) bootstrap = AXBootstrap(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile, region=self._cfg.cloud_region) bootstrap.modify_node_security_groups( old_cidr=[], new_cidr=[self._cidr], action_name="allow-cluster-manager")
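# --- Illustrative sketch (not part of the original module) ---
# _check_cluster_fixture() above shells out to curl with basic-auth credentials pulled from the
# kubeconfig. The same check can be sketched with the requests library instead of a subprocess;
# the endpoint path is the one hard-coded above, and the kubeconfig path in the usage comment is
# hypothetical.

import yaml
import requests

FIXTURE_PATH = ("/api/v1/proxy/namespaces/axsys/services/fixturemanager/"
                "v1/fixture/instances?deleted=false")


def remaining_fixtures(kube_config_path):
    """Return the list of non-deleted managed fixtures, or None if it cannot be determined."""
    with open(kube_config_path, "r") as f:
        kube_config = yaml.safe_load(f)
    server = kube_config["clusters"][0]["cluster"]["server"]
    username = password = None
    for user in kube_config.get("users", []):
        u = user["user"]
        if u.get("username"):
            username, password = u["username"], u.get("password")
            break
    if not (username and password):
        return None  # no basic-auth credentials; caller decides how to proceed
    try:
        resp = requests.get(server + FIXTURE_PATH, auth=(username, password),
                            verify=False, timeout=15)
        return resp.json().get("data", [])
    except (requests.RequestException, ValueError):
        return None  # unreachable cluster or non-JSON response

# Example usage (hypothetical path):
#   fixtures = remaining_fixtures("/tmp/ax_kube/cluster.conf")
#   if fixtures:
#       raise RuntimeError("Please clean up all fixtures before uninstalling")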