def platform(self, args):
    """
    Start or stop the platform on the target cluster.

    :param args: parsed command-line arguments. Reads ``target_cloud``,
                 ``cluster_name``, ``aws_profile``, ``subcommand`` and, for the
                 ``start`` subcommand, ``debug``.
    :return: None. Exits the process with status 1 if ``subcommand`` is neither
             'start' nor 'stop'.
    """
    from ax.platform.platform import AXPlatform
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    Cloud().set_target_cloud(args.target_cloud)

    # A cluster name is only optional when running inside a pod.
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"
    name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()

    if args.subcommand == 'start':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile, debug=args.debug).start()
    elif args.subcommand == 'stop':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile).stop()
    else:
        # The format string has three placeholders: color on, the offending
        # subcommand, color off. All three must be supplied or the logging
        # module fails to render the record.
        logger.error("%sInvalid command '%s'%s", COLOR_RED, args.subcommand, COLOR_NORM)
        sys.exit(1)
def install_and_run_platform(self):
    """
    Install the platform onto the cluster and fetch the initial credentials.

    Any exception raised while starting the platform (or stopping its monitor)
    is logged and re-raised as a RuntimeError carrying a retry hint.

    :return: cluster_dns_name, username, password
    """
    logger.info("Starting platform install")

    # Install Argo micro-services
    ax_platform = AXPlatform(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        manifest_root=self._cfg.manifest_root,
        config_file=self._cfg.bootstrap_config
    )

    retry_hint = "\nPlease manually check the cluster status and retry installation with same command if the error is transient."

    # None means the install succeeded; otherwise holds the message to raise.
    failure_message = None
    try:
        ax_platform.start()
        ax_platform.stop_monitor()
    except Exception as err:
        logger.exception(err)
        failure_message = str(err) + retry_hint

    # Raised outside the except block, exactly as before.
    if failure_message is not None:
        raise RuntimeError(failure_message)

    # Platform is up: connect to axops to get initial username and password
    credentials = self._get_initial_cluster_credentials()
    username, password = credentials

    logger.info("Done with platform install")
    return ax_platform.cluster_dns_name, username, password
def kubernetes(self, args):
    """
    Create or delete a single platform object on the cluster.

    :param args: parsed command-line arguments. Reads ``cluster_name``,
                 ``aws_profile``, ``subcommand`` and ``object_name``.
    :return: None. Subcommands other than 'create' and 'delete' are ignored.
    """
    from ax.platform.platform import AXPlatform
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    # A cluster name is only optional when running inside a pod.
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"

    cluster_id = AXClusterId(args.cluster_name, args.aws_profile)
    name_id = cluster_id.get_cluster_name_id()
    ax_platform = AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile)

    # Map each supported subcommand to the AXPlatform method that handles it.
    method_by_subcommand = {
        'create': 'start_one',
        'delete': 'stop_one',
    }
    method_name = method_by_subcommand.get(args.subcommand)
    if method_name is not None:
        getattr(ax_platform, method_name)(args.object_name)
def _start_platform(self):
    """
    Bring up the Argo platform services using the manifests, bootstrap config
    and target software info taken from this object's configuration.

    :return:
    """
    logger.info("Bringing up Argo platform ...")

    cfg = self._cfg
    argo_platform = AXPlatform(
        cluster_name_id=self._name_id,
        aws_profile=cfg.cloud_profile,
        manifest_root=cfg.manifest_root,
        config_file=cfg.bootstrap_config,
        software_info=cfg.target_software_info
    )

    argo_platform.start()
    argo_platform.stop_monitor()
def _shutdown_platform(self):
    """
    Shut the platform down, driven by the manifests and config that are first
    downloaded into the temporary platform locations.

    :return:
    """
    logger.info("Shutting Argo platform ...")

    # Fetch the manifests/config to the temp paths before building the platform.
    self._cluster_info.download_platform_manifests_and_config(
        target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
        target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH
    )

    argo_platform = AXPlatform(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
        config_file=TEMP_PLATFORM_CONFIG_PATH
    )

    argo_platform.stop()
    argo_platform.stop_monitor()
def _start_platform(self):
    """
    Bring up the Argo platform services, driven by the manifests and config
    that are first downloaded into the temporary platform locations.

    :return:
    """
    logger.info("Bringing up Argo platform ...")

    # Fetch the manifests/config to the temp paths before building the platform.
    self._cluster_info.download_platform_manifests_and_config(
        target_platform_manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
        target_platform_config_path=TEMP_PLATFORM_CONFIG_PATH
    )

    argo_platform = AXPlatform(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        manifest_root=TEMP_PLATFORM_MANIFEST_ROOT,
        config_file=TEMP_PLATFORM_CONFIG_PATH,
        software_info=self._software_info
    )

    argo_platform.start()
    argo_platform.stop_monitor()
def _ensure_argo_microservices(self):
    """
    This step won't run if there is "--dry-run" specified.

    This step assumes there is a running Kubernetes cluster. This step does the following:
        - ensure ASG count
        - ensure trusted CIDRs
        - install Argo software on to the cluster and make sure they are up and running
          (We don't monitor if the microservice is having a crash loop)
        - Remove manager CIDR if it is not part of user-specified trusted CIDRs
        - Upload stage2 information to S3
          (NOTE(review): presumably done inside post_install() - confirm)

    Stage2 is an indication that the cluster has been successfully installed: Kubernetes is up
    and running, and all Argo software are up and running. It does not ensure that none of the
    Argo software is in a crash loop.

    This step is idempotent
    :return: cluster_dns_name, username, password. Username and password are empty strings
             when the cluster is already at stage2 and installation is skipped.
    """
    logger.info("Cluster installation step: Ensure Argo Micro-services")

    # Reload config in case stage0 and stage1 are skipped
    self._cluster_config.reload_config()

    trusted_cidrs = self._cluster_config.get_trusted_cidr()

    # Instantiate AXBootstrap object. There are a bunch of stand-alone tasks we need to
    # perform using that object.
    axbootstrap = AXBootstrap(cluster_name_id=self._name_id,
                              aws_profile=self._cfg.cloud_profile,
                              region=self._cluster_config.get_region())

    # We allow access from everywhere during installation phase, but will remove this access
    # if user does not specify 0.0.0.0/0 as their trusted CIDR
    axbootstrap.modify_node_security_groups(old_cidr=[],
                                            new_cidr=trusted_cidrs + [EC2IPPermission.AllIP],
                                            action_name="allow-creator")

    if check_cluster_staging(self._cluster_info, "stage2"):
        # TODO: some duplicated logic here, might need to combine them.
        logger.info(
            "Skip ensure Argo micro-services since cluster has already been successfully installed"
        )
        platform = AXPlatform(cluster_name_id=self._name_id, aws_profile=self._cfg.cloud_profile)
        if EC2IPPermission.AllIP not in trusted_cidrs:
            axbootstrap.modify_node_security_groups(
                old_cidr=[EC2IPPermission.AllIP],
                new_cidr=[],
                action_name="disallow-creator")
        return platform.get_cluster_external_dns(), "", ""

    # Modify ASG: user node counts are the configured totals minus the system nodes
    axsys_node_count = int(self._cluster_config.get_asxys_node_count())
    axuser_min_count = int(self._cluster_config.get_min_node_count()) - axsys_node_count
    axuser_max_count = int(self._cluster_config.get_max_node_count()) - axsys_node_count
    axbootstrap.modify_asg(min=axuser_min_count, max=axuser_max_count)

    # The same name must be used for the binding and the return below; a
    # mismatch here raises NameError only after a full, successful install.
    cluster_dns_name, username, password = self.install_and_run_platform()

    self.post_install()

    # Remove access from 0.0.0.0/0 if this is not what user specifies
    if EC2IPPermission.AllIP not in trusted_cidrs:
        axbootstrap.modify_node_security_groups(
            old_cidr=[EC2IPPermission.AllIP],
            new_cidr=[],
            action_name="disallow-creator")

    return cluster_dns_name, username, password
def _ensure_argo_microservices(self):
    """
    Installation step: make sure the Argo micro-services are installed on the cluster.

    Skipped entirely when "--dry-run" is specified. A running Kubernetes cluster is assumed.
    The step performs, in order:
        - open node security groups to the trusted CIDRs plus 0.0.0.0/0 for the install
        - short-circuit if the cluster is already marked "stage2"
        - adjust the ASG min/max counts (configured counts minus the axsys node count)
        - install Argo software and wait for it to come up
          (crash loops of individual micro-services are not monitored)
        - revoke 0.0.0.0/0 unless the user listed it as a trusted CIDR
        - upload manifests/config and the stage2 marker

    Stage2 marks a successfully installed cluster: Kubernetes and all Argo software are up.
    It does not guarantee that no Argo software is in a crash loop.

    This step is idempotent
    :return: cluster_dns_name, username, password
    """
    logger.info("Cluster installation step: Ensure Argo Micro-services")

    # Reload config in case stage0 and stage1 are skipped
    self._cluster_config.reload_config()
    trusted_cidrs = self._cluster_config.get_trusted_cidr()

    # AXBootstrap carries the stand-alone security-group and ASG tasks used below.
    bootstrap = AXBootstrap(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region()
    )

    def _revoke_creator_access():
        # Remove access from 0.0.0.0/0 if this is not what user specifies
        if EC2IPPermission.AllIP not in trusted_cidrs:
            bootstrap.modify_node_security_groups(
                old_cidr=[EC2IPPermission.AllIP],
                new_cidr=[],
                action_name="disallow-creator"
            )

    # Access from everywhere is allowed while installing; it is revoked afterwards
    # unless the user explicitly trusts 0.0.0.0/0.
    bootstrap.modify_node_security_groups(
        old_cidr=[],
        new_cidr=trusted_cidrs + [EC2IPPermission.AllIP],
        action_name="allow-creator"
    )

    if check_cluster_staging(self._cluster_info, "stage2"):
        # TODO: some duplicated logic here, might need to combine them.
        logger.info(
            "Skip ensure Argo micro-services since cluster has already been successfully installed"
        )
        existing_platform = AXPlatform(cluster_name_id=self._name_id,
                                       aws_profile=self._cfg.cloud_profile)
        _revoke_creator_access()
        return existing_platform.get_cluster_external_dns(), "", ""

    # Modify ASG
    system_node_count = int(self._cluster_config.get_asxys_node_count())
    user_min_count = int(self._cluster_config.get_min_node_count()) - system_node_count
    user_max_count = int(self._cluster_config.get_max_node_count()) - system_node_count
    bootstrap.modify_asg(min=user_min_count, max=user_max_count)

    # Install Argo micro-services
    new_platform = AXPlatform(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        manifest_root=self._cfg.manifest_root,
        config_file=self._cfg.bootstrap_config
    )

    retry_hint = "\nPlease manually check the cluster status and retry installation with same command if the error is transient."

    # None means the install succeeded; otherwise holds the message to raise.
    failure_message = None
    try:
        new_platform.start()
        new_platform.stop_monitor()
    except Exception as err:
        logger.exception(err)
        failure_message = str(err) + retry_hint

    # Raised outside the except block, exactly as before.
    if failure_message is not None:
        raise RuntimeError(failure_message)

    # Platform is up: connect to axops to get initial username and password
    username, password = self._get_initial_cluster_credentials()

    _revoke_creator_access()

    # Persist manifests to S3
    self._cluster_info.upload_platform_manifests_and_config(
        platform_manifest_root=self._cfg.manifest_root,
        platform_config=self._cfg.bootstrap_config
    )

    # Finally persist stage2 information
    self._cluster_info.upload_staging_info(stage="stage2", msg="stage2")

    logger.info(
        "Cluster installation step: Ensure Argo Micro-services successfully finished"
    )
    return new_platform.cluster_dns_name, username, password