def destroy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        try:
            # Here we remove some finalizers that can cause our delete to hang indefinitely
            try:
                sh.run(
                    "kubectl patch crd/trainingjobs.sagemaker.aws.amazon.com "
                    '--patch \'{"metadata":{"finalizers":[]}}\' --type=merge'
                    f" --context {k8s_context}"
                )
            except exceptions.FailedShellCommand:
                _logger.debug("Ignoring patch failure")

            output_path = _generate_orbit_system_manifest(context=context)
            sh.run(
                f"kubectl delete -f {output_path} --grace-period=0 --force "
                f"--ignore-not-found --wait --context {k8s_context}"
            )

            output_paths = _generate_orbit_system_kustomizations(context=context, clean_up=True)
            for output_path in output_paths:
                sh.run(
                    f"kubectl delete -k {output_path} --grace-period=0 --force "
                    f"--ignore-not-found --wait --context {k8s_context}"
                )
        except exceptions.FailedShellCommand as ex:
            # Let's leave it for eksctl, it will destroy everything anyway...
            _logger.debug("Skipping: %s", ex)

def deploy_env(env_name: str, manifest_dir: str) -> None:
    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    _logger.debug("Context loaded.")
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)
    _logger.debug("Changeset loaded.")
    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")
    cdk_toolkit.deploy(context=context)
    _logger.debug("CDK Toolkit Stack deployed")
    env.deploy(
        context=context,
        eks_system_masters_roles_changes=changeset.eks_system_masters_roles_changeset if changeset else None,
    )
    _logger.debug("Env Stack deployed")
    eksctl.deploy_env(
        context=context,
        changeset=changeset,
    )
    _logger.debug("EKS Environment Stack deployed")
    kubectl.deploy_env(context=context)
    _logger.debug("Kubernetes Environment components deployed")
    helm.deploy_env(context=context)
    _logger.debug("Helm Charts installed")
    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context)
    ContextSerDe.dump_context_to_ssm(context=context)
    _logger.debug("Updating userpool redirect")
    _update_userpool_client(context=context)
    _update_userpool(context=context)

def deploy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl context: %s", k8s_context)
        output_path = _generate_team_context(context=context, team_context=team_context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

def deploy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl context: %s", k8s_context)
        output_path = _generate_team_context(context=context, team_context=team_context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        output_path = _generate_env_manifest(context=context, clean_up=False)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=True)

def destroy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        _logger.debug("Attempting kubectl delete for team %s", team_context.name)
        output_path = _generate_team_context(context=context, team_context=team_context)
        sh.run(
            f"kubectl delete -f {output_path} --grace-period=0 --force "
            f"--ignore-not-found --wait --context {k8s_context}"
        )

def destroy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        output_path = _generate_env_manifest(context=context)
        try:
            sh.run(
                f"kubectl delete -f {output_path} --grace-period=0 --force "
                f"--ignore-not-found --wait --context {k8s_context}"
            )
        except exceptions.FailedShellCommand as ex:
            # Let's leave it for eksctl, it will destroy everything anyway...
            _logger.debug("Skipping: %s", ex)

def delete_istio_ingress(context: Context) -> None:
    try:
        sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)
        _logger.info("Deleting istio-ingress")
        sh.run(f"kubectl delete ingress -n istio-system --context {k8s_context} --wait istio-ingress")
        time.sleep(30)
        _logger.info("Deleted istio-ingress")
    except:  # noqa: E722
        _logger.exception("Failed to delete istio-ingress")

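# The fixed time.sleep(30) above only approximates how long the ALB controller needs
# to finish cleanup after the ingress object is removed. A sketch of a polling
# alternative, assuming sh.run raises exceptions.FailedShellCommand on non-zero exit;
# _wait_for_ingress_deletion is a hypothetical name, not part of this codebase:
def _wait_for_ingress_deletion(k8s_context: str, timeout: int = 120) -> None:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            # "kubectl get" exits non-zero once the ingress no longer exists
            sh.run(f"kubectl get ingress istio-ingress -n istio-system --context {k8s_context}")
        except exceptions.FailedShellCommand:
            return
        time.sleep(5)
    _logger.warning("istio-ingress still present after %s seconds", timeout)
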
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)
        if context.networking.data.internet_accessible is False:
            output_path = _generate_efs_driver_manifest(context=context)
            sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")
        else:
            sh.run(f"kubectl apply -k {EFS_DRIVE} --context {k8s_context} --wait")
        output_path = _generate_env_manifest(context=context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        sh.run(f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true")
        fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=False)

def destroy_teams(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
        for team_context in context.teams:
            plugins.PLUGINS_REGISTRIES.destroy_team_plugins(context=context, team_context=team_context)
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        _logger.debug("Attempting kubectl delete")
        output_path = _generate_teams_manifest(context=context)
        utils.print_dir(dir=output_path)
        try:
            sh.run(
                f"kubectl delete -f {output_path} --grace-period=0 --force "
                f"--ignore-not-found --wait=false --context {k8s_context}"
            )
        except exceptions.FailedShellCommand as ex:
            # Let's leave it for eksctl, it will destroy everything anyway...
            _logger.debug("Skipping: %s", ex)

def deploy_env(args: Tuple[str, ...]) -> None:
    _logger.debug("args: %s", args)
    if len(args) == 2:
        env_name: str = args[0]
        skip_images_remote_flag: str = str(args[1])
    else:
        raise ValueError("Unexpected number of values in args")
    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    _logger.debug("Context loaded.")
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)
    _logger.debug("Changeset loaded.")
    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")
    cdk_toolkit.deploy(context=context)
    _logger.debug("CDK Toolkit Stack deployed")
    env.deploy(
        context=context,
        eks_system_masters_roles_changes=changeset.eks_system_masters_roles_changeset if changeset else None,
    )
    _logger.debug("Env Stack deployed")
    deploy_images_remotely(context=context, skip_images=skip_images_remote_flag == "skip-images")
    _logger.debug("Docker Images deployed")
    eksctl.deploy_env(
        context=context,
        changeset=changeset,
    )
    _logger.debug("EKS Environment Stack deployed")
    kubectl.deploy_env(context=context)
    _logger.debug("Kubernetes Environment components deployed")
    helm.deploy_env(context=context)
    _logger.debug("Helm Charts installed")
    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=False)
    ContextSerDe.dump_context_to_ssm(context=context)

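# Grounded in the arg parsing above: this deploy_env expects exactly two positional
# values, the environment name and a flag compared against the literal "skip-images".
# A hypothetical invocation ("my-env" is made up; any second value other than
# "skip-images" triggers the remote image builds):
deploy_env(args=("my-env", "skip-images"))
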
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)

        # EFS Driver
        output_path = _generate_efs_driver_manifest(context=context)
        sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # FSx Driver
        output_path = _generate_fsx_driver_manifest(context=context)
        sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # Orbit Env
        output_path = _generate_env_manifest(context=context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        sh.run(f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true")

def deploy_teams(args: Tuple[str, ...]) -> None:
    _logger.debug("args: %s", args)
    if len(args) == 1:
        env_name: str = args[0]
    else:
        raise ValueError("Unexpected number of values in args")
    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    _logger.debug("Context loaded.")
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)
    _logger.debug("Changeset loaded.")

    if changeset:
        plugins.PLUGINS_REGISTRIES.load_plugins(
            context=context,
            plugin_changesets=changeset.plugin_changesets,
            teams_changeset=changeset.teams_changeset,
        )
        _logger.debug("Plugins loaded")

    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")

    if changeset and changeset.teams_changeset and changeset.teams_changeset.removed_teams_names:
        kubectl.write_kubeconfig(context=context)
        for team_name in changeset.teams_changeset.removed_teams_names:
            team_context: Optional["TeamContext"] = context.get_team_by_name(name=team_name)
            if team_context is None:
                raise RuntimeError(f"TeamContext {team_name} not found!")
            _logger.debug("Destroying team %s", team_name)
            plugins.PLUGINS_REGISTRIES.destroy_team_plugins(context=context, team_context=team_context)
            _logger.debug("Team Plugins destroyed")
            helm.destroy_team(context=context, team_context=team_context)
            _logger.debug("Team Helm Charts uninstalled")
            kubectl.destroy_team(context=context, team_context=team_context)
            _logger.debug("Kubernetes Team components destroyed")
            eksctl.destroy_team(context=context, team_context=team_context)
            _logger.debug("EKS Team Stack destroyed")
            teams.destroy_team(context=context, team_context=team_context)
            _logger.debug("Team %s destroyed", team_name)
            context.remove_team_by_name(name=team_name)
            ContextSerDe.dump_context_to_ssm(context=context)

    team_names = [t.name for t in context.teams]
    if changeset and changeset.teams_changeset and changeset.teams_changeset.added_teams_names:
        team_names.extend(changeset.teams_changeset.added_teams_names)

    manifest: Optional["Manifest"] = ManifestSerDe.load_manifest_from_ssm(env_name=context.name, type=Manifest)
    if manifest is None:
        raise RuntimeError(f"Manifest {context.name} not found!")
    kubectl.write_kubeconfig(context=context)
    for team_name in team_names:
        team_manifest = manifest.get_team_by_name(name=team_name)
        if team_manifest is None:
            raise RuntimeError(f"TeamManifest {team_name} not found!")
        teams.deploy_team(context=context, manifest=manifest, team_manifest=team_manifest)
        _logger.debug("Team Stacks deployed")
        team_context = context.get_team_by_name(name=team_name)
        if team_context is None:
            raise RuntimeError(f"TeamContext {team_name} not found!")
        eksctl.deploy_team(context=context, team_context=team_context)
        _logger.debug("EKS Team Stack deployed")
        kubectl.deploy_team(context=context, team_context=team_context)
        _logger.debug("Kubernetes Team components deployed")
        helm.deploy_team(context=context, team_context=team_context)
        _logger.debug("Team Helm Charts installed")
        plugins.PLUGINS_REGISTRIES.deploy_team_plugins(
            context=context,
            team_context=team_context,
            changes=changeset.plugin_changesets if changeset else [],
        )
        team_context.plugins = team_manifest.plugins
        ContextSerDe.dump_context_to_ssm(context=context)
        _logger.debug("Team Plugins deployed")

    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=True)
    _logger.debug("Teams deployed")

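# Like deploy_env above, deploy_teams is driven by a positional args tuple; it expects
# exactly one value, the environment name. A hypothetical invocation ("my-env" is made up):
deploy_teams(args=("my-env",))
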
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)

        # orbit-system kustomizations
        output_paths = _generate_orbit_system_kustomizations(context=context)
        for path in output_paths:
            sh.run(f"kubectl apply -k {path} --context {k8s_context} --wait")

        # Wait until the cert-manager webhook is available
        _confirm_endpoints(name="cert-manager-webhook", namespace="cert-manager", k8s_context=k8s_context)
        _confirm_readiness(name="cert-manager", namespace="cert-manager", type="Deployment", k8s_context=k8s_context)
        _confirm_readiness(
            name="cert-manager-cainjector", namespace="cert-manager", type="Deployment", k8s_context=k8s_context
        )

        output_path: Optional[str] = _generate_orbit_system_manifest(context=context, clean_up=True)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        output_path = _generate_orbit_image_replicator_manifest(context=context, clean_up=True)
        if output_path is not None:
            sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        # Commented until we confirm this isn't needed:
        # Restart orbit-system deployments and statefulsets to force reload of caches etc.
        # sh.run(f"kubectl rollout restart deployments -n orbit-system --context {k8s_context}")

        _confirm_readiness(name="podsetting-operator", namespace="orbit-system", type="deployment", k8s_context=k8s_context)
        _confirm_readiness(name="teamspace-operator", namespace="orbit-system", type="deployment", k8s_context=k8s_context)
        _confirm_readiness(name="userspace-operator", namespace="orbit-system", type="deployment", k8s_context=k8s_context)
        _confirm_endpoints(name="podsetting-pod-webhook", namespace="orbit-system", k8s_context=k8s_context)

        if context.install_image_replicator or not context.networking.data.internet_accessible:
            _confirm_readiness(
                name="imagereplication-operator", namespace="orbit-system", type="deployment", k8s_context=k8s_context
            )
            _confirm_endpoints(name="imagereplication-pod-webhook", namespace="orbit-system", k8s_context=k8s_context)

        sh.run(
            "kubectl rollout restart daemonsets -n orbit-system-ssm-daemons "
            f"ssm-agent-installer --context {k8s_context}"
        )

        # kube-system kustomizations
        output_paths = _generate_kube_system_kustomizations(context=context)
        for output_path in output_paths:
            sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # kube-system manifests
        output_path = _generate_kube_system_manifest(context=context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        # Enable ENIs
        _enable_eni(k8s_context=k8s_context)

        # kubeflow-namespaces
        output_path = _kubeflow_namespaces(context=context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        kubeflow.deploy_kubeflow(context=context)

        # env
        output_paths = _generate_orbit_system_env_kustomizations(context=context)
        for output_path in output_paths:
            sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # Patch Kubeflow
        _logger.debug("Orbit applying KubeFlow patch")
        jupyter_launcher_config_map, patch = _generate_kubeflow_patch(context=context)
        sh.run(f"kubectl apply -f {jupyter_launcher_config_map} --context {k8s_context} --wait")
        sh.run(f'kubectl patch deployment -n kubeflow jupyter-web-app-deployment --patch "{patch}"')
        sh.run("kubectl rollout restart deployment jupyter-web-app-deployment -n kubeflow")
        _apply_deployment_patch_force_env_nodes("istio-system")
        _apply_deployment_patch_force_env_nodes("knative-serving")
        _apply_deployment_patch_force_env_nodes("kube-system")
        _apply_deployment_patch_force_env_nodes("kubeflow")

        # Patch Pods to push onto Fargate when deploying into an isolated subnet
        if not context.networking.data.internet_accessible:
            patch = (
                '{"spec":{"template":{"metadata":{"labels":{"orbit/node-type":"fargate"}},'
                '"spec":{"nodeSelector": null}}}}'
            )
            sh.run(f"kubectl patch deployment -n istio-system authzadaptor --patch '{patch}'")

            patch = (
                '{"spec":{"template":{"metadata":{"labels":{"orbit/node-type":"fargate"}},'
                '"spec":{"nodeSelector": null, "containers":[{"name":"alb-ingress-controller","args":'
                '["--ingress-class=alb","--cluster-name=$(CLUSTER_NAME)","--aws-vpc-id=VPC_ID"]}]}}}}'
            )
            patch = patch.replace("VPC_ID", cast(str, context.networking.vpc_id))
            sh.run(f"kubectl patch deployment -n kubeflow alb-ingress-controller --patch '{patch}'")

        # Patch the kubeflow mpi-operator deployment to version-lock the images to v0.2.3
        patch = (
            '{"spec":{"template":{"spec":{"containers":[{"name":"mpi-operator","args":["-alsologtostderr",'
            '"--lock-namespace","kubeflow","--kubectl-delivery-image","mpioperator/kubectl-delivery:v0.2.3"],'
            '"image":"mpioperator/mpi-operator:v0.2.3"}]}}}}'
        )
        sh.run(f"kubectl patch deployment -n kubeflow mpi-operator --patch '{patch}'")

        # Confirm env Service Endpoints
        _confirm_endpoints(name="landing-page-service", namespace="orbit-system", k8s_context=k8s_context)

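# The helpers used above (_confirm_readiness, _confirm_endpoints, _enable_eni,
# _apply_deployment_patch_force_env_nodes) are referenced but not defined in this
# section. Minimal sketches of what they might look like, assuming sh.run executes
# through a shell and raises exceptions.FailedShellCommand on non-zero exit; these
# are illustrative guesses, not the actual implementations. The "orbit/node-type"
# label value "env" and the 300s/5s timeouts are assumptions.
def _confirm_readiness(name: str, namespace: str, type: str, k8s_context: str) -> None:
    # "kubectl rollout status" blocks until the rollout completes or the timeout expires
    sh.run(f"kubectl rollout status {type}/{name} -n {namespace} --timeout=300s --context {k8s_context}")


def _confirm_endpoints(name: str, namespace: str, k8s_context: str) -> None:
    # Poll until the Service has at least one ready endpoint address; the jsonpath
    # lookup exits non-zero while no address is registered
    for _ in range(60):
        try:
            sh.run(
                f"kubectl get endpoints {name} -n {namespace} "
                f'-o jsonpath="{{.subsets[0].addresses[0].ip}}" --context {k8s_context}'
            )
            return
        except exceptions.FailedShellCommand:
            time.sleep(5)
    raise RuntimeError(f"No ready endpoints for {namespace}/{name}")


def _enable_eni(k8s_context: str) -> None:
    # Same command the simpler deploy_env variants above run inline
    sh.run(f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true")


def _apply_deployment_patch_force_env_nodes(namespace: str) -> None:
    # Hypothetical: pin every Deployment in the namespace to the env node group,
    # mirroring (inverted) the fargate nodeSelector patches above
    patch = '{"spec":{"template":{"spec":{"nodeSelector":{"orbit/node-type":"env"}}}}}'
    sh.run(
        f"kubectl get deployments -n {namespace} -o name "
        f"| xargs -I {{}} kubectl patch {{}} -n {namespace} --patch '{patch}'"
    )
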
def gen_kubeflow_config(context: Context, output_path: str, cluster_name: str) -> None:
    os.makedirs(output_path, exist_ok=True)
    _cleanup_output(output_path=output_path)
    if context.account_id is None:
        raise RuntimeError("context.account_id is None!")
    if context.region is None:
        raise RuntimeError("context.region is None!")

    input = os.path.join(CONFIG_PATH, "kfctl_aws.yaml")
    output = os.path.join(output_path, "kfctl_aws.yaml")

    client = boto3_client(service_name="cognito-idp")
    response: Dict[str, Any] = client.describe_user_pool(UserPoolId=context.user_pool_id)
    domain: str = response["UserPool"].get("Domain")

    with open(input, "r") as file:
        content: str = file.read()
    content = utils.resolve_parameters(
        content,
        dict(
            certArn=context.networking.frontend.ssl_cert_arn,
            cognitoAppClientId=context.user_pool_client_id,
            cognitoUserPoolID=context.user_pool_id,
            account_id=context.account_id,
            region=context.region,
            env_name=context.name,
            cluster_name=cluster_name,
            cognitoUserPoolDomain=domain,
        ),
    )
    _logger.debug("Kubeflow configuration:\n%s", content)
    with open(output, "w") as file:
        file.write(content)

    k8s_context = get_k8s_context(context=context)

    input = os.path.join(CONFIG_PATH, "apply_kf.sh")
    output = os.path.join(output_path, "apply_kf.sh")
    with open(input, "r") as file:
        content = file.read()
    content = utils.resolve_parameters(
        content,
        dict(cluster_name=cluster_name, k8s_context=k8s_context),
    )
    _logger.debug("Kubeflow script:\n%s", content)
    with open(output, "w") as file:
        file.write(content)
    sh.run(f"chmod a+x {output}")

    input = os.path.join(CONFIG_PATH, "delete_kf.sh")
    output = os.path.join(output_path, "delete_kf.sh")
    with open(input, "r") as file:
        content = file.read()
    content = utils.resolve_parameters(
        content,
        dict(cluster_name=cluster_name, k8s_context=k8s_context),
    )
    _logger.debug("Kubeflow script:\n%s", content)
    with open(output, "w") as file:
        file.write(content)
    sh.run(f"chmod a+x {output}")
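
# utils.resolve_parameters is used above to substitute values into the template files.
# A minimal sketch of such a helper, assuming simple ${name}-style placeholders (the
# actual placeholder syntax used by the templates is not shown in this section):
def resolve_parameters(content: str, parameters: Dict[str, Any]) -> str:
    for key, value in parameters.items():
        content = content.replace(f"${{{key}}}", str(value))
    return content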