Example no. 1
def destroy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(
            f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context"
        )
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        try:
            # Here we remove some finalizers that can cause our delete to hang indefinitely
            try:
                sh.run(
                    "kubectl patch crd/trainingjobs.sagemaker.aws.amazon.com "
                    '--patch \'{"metadata":{"finalizers":[]}}\' --type=merge'
                    f" --context {k8s_context}")
            except FailedShellCommand:
                _logger.debug("Ignoring patch failure")

            output_path = _generate_orbit_system_manifest(context=context)
            sh.run(f"kubectl delete -f {output_path} --grace-period=0 --force "
                   f"--ignore-not-found --wait --context {k8s_context}")
            output_paths = _generate_orbit_system_kustomizations(
                context=context, clean_up=True)
            for output_path in output_paths:
                sh.run(
                    f"kubectl delete -k {output_path} --grace-period=0 --force "
                    f"--ignore-not-found --wait --context {k8s_context}")

        except exceptions.FailedShellCommand as ex:
            _logger.debug("Skipping: %s", ex)
            # Let's leave it for eksctl; it will destroy everything anyway.
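Example no. 1 relies on sh.run raising FailedShellCommand when a command exits non-zero; the sh module itself is not part of these listings. A minimal sketch of such a wrapper, purely as an assumption about its behavior (the project's actual implementation may stream output, retry, etc.):

import subprocess


class FailedShellCommand(Exception):
    """Raised when a shell command exits with a non-zero status."""


def run(command: str) -> None:
    # Minimal stand-in for sh.run(): execute through a shell and surface
    # failures as FailedShellCommand, which Example no. 1 catches.
    result = subprocess.run(command, shell=True)
    if result.returncode != 0:
        raise FailedShellCommand(f"Command failed ({result.returncode}): {command}")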
def deploy_env(env_name: str, manifest_dir: str) -> None:
    # Assumed setup, mirroring Example no. 10: load the context and any pending
    # changeset from SSM before running the deployment steps below.
    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)

    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")
    cdk_toolkit.deploy(context=context)
    _logger.debug("CDK Toolkit Stack deployed")
    env.deploy(
        context=context,
        eks_system_masters_roles_changes=changeset.eks_system_masters_roles_changeset if changeset else None,
    )

    _logger.debug("Env Stack deployed")
    eksctl.deploy_env(
        context=context,
        changeset=changeset,
    )
    _logger.debug("EKS Environment Stack deployed")
    kubectl.deploy_env(context=context)
    _logger.debug("Kubernetes Environment components deployed")

    helm.deploy_env(context=context)
    _logger.debug("Helm Charts installed")

    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context)
    ContextSerDe.dump_context_to_ssm(context=context)
    _logger.debug("Updating userpool redirect")
    _update_userpool_client(context=context)
    _update_userpool(context=context)
Example no. 3
def deploy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl context: %s", k8s_context)
        output_path = _generate_team_context(context=context,
                                             team_context=team_context)
        sh.run(
            f"kubectl apply -f {output_path} --context {k8s_context} --wait")
Example no. 4
def deploy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl context: %s", k8s_context)
        output_path = _generate_team_context(context=context, team_context=team_context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        output_path = _generate_env_manifest(context=context, clean_up=False)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=True)
def destroy_team(context: "Context", team_context: "TeamContext") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        _logger.debug("Attempting kubectl delete for team %s",
                      team_context.name)
        output_path = _generate_team_context(context=context,
                                             team_context=team_context)
        sh.run(f"kubectl delete -f {output_path} --grace-period=0 --force "
               f"--ignore-not-found --wait --context {k8s_context}")
def destroy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(
            f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context"
        )
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        output_path = _generate_env_manifest(context=context)
        try:
            sh.run(f"kubectl delete -f {output_path} --grace-period=0 --force "
                   f"--ignore-not-found --wait --context {k8s_context}")
        except exceptions.FailedShellCommand as ex:
            _logger.debug("Skipping: %s", ex)
            # Let's leave it for eksctl; it will destroy everything anyway.
def delete_istio_ingress(context: Context) -> None:
    try:
        sh.run(
            f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context"
        )
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)

        _logger.info("Deleting istio-ingress")
        sh.run(
            f"kubectl delete ingress -n istio-system --context {k8s_context} --wait istio-ingress"
        )
        time.sleep(30)
        _logger.info("Deleted istio-ingress")
    except:  # noqa: E722
        _logger.exception("Failed to delete istio-ingress")
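Every example resolves a kubectl context via get_k8s_context(context=context), which is not shown in these listings. One hedged approximation, assuming eksctl utils write-kubeconfig has just made the Orbit cluster the active context, is to read it back from the kubeconfig (the real helper most likely derives the name from context fields instead):

import subprocess


def _get_current_k8s_context() -> str:
    # Hypothetical fallback for get_k8s_context(): after
    # "eksctl utils write-kubeconfig ... --set-kubeconfig-context",
    # the Orbit cluster is the active context, so read it back with kubectl.
    completed = subprocess.run(
        ["kubectl", "config", "current-context"],
        check=True,
        capture_output=True,
        text=True,
    )
    return completed.stdout.strip()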
Example no. 8
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)
        if context.networking.data.internet_accessible is False:
            output_path = _generate_efs_driver_manifest(context=context)
            sh.run(f"kubectl apply -k {output_path} --context {k8s_context} --wait")
        else:
            sh.run(f"kubectl apply -k {EFS_DRIVE} --context {k8s_context} --wait")
        output_path = _generate_env_manifest(context=context)
        sh.run(f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        sh.run(f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true")

        fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=False)
Example no. 9
def destroy_teams(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
        for team_context in context.teams:
            plugins.PLUGINS_REGISTRIES.destroy_team_plugins(context=context, team_context=team_context)
        k8s_context = get_k8s_context(context=context)
        _logger.debug("kubectl k8s_context: %s", k8s_context)
        _logger.debug("Attempting kubectl delete")
        output_path = _generate_teams_manifest(context=context)
        utils.print_dir(dir=output_path)
        try:
            sh.run(
                f"kubectl delete -f {output_path} --grace-period=0 --force "
                f"--ignore-not-found --wait=false --context {k8s_context}"
            )
        except exceptions.FailedShellCommand as ex:
            _logger.debug("Skipping: %s", ex)
            # Let's leave it for eksctl; it will destroy everything anyway.
Example no. 10
def deploy_env(args: Tuple[str, ...]) -> None:
    _logger.debug("args: %s", args)
    if len(args) == 2:
        env_name: str = args[0]
        skip_images_remote_flag: str = str(args[1])
    else:
        raise ValueError("Unexpected number of values in args")

    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    _logger.debug("Context loaded.")
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)
    _logger.debug("Changeset loaded.")

    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")
    cdk_toolkit.deploy(context=context)
    _logger.debug("CDK Toolkit Stack deployed")
    env.deploy(
        context=context,
        eks_system_masters_roles_changes=changeset.eks_system_masters_roles_changeset if changeset else None,
    )
    _logger.debug("Env Stack deployed")
    deploy_images_remotely(context=context, skip_images=skip_images_remote_flag == "skip-images")
    _logger.debug("Docker Images deployed")
    eksctl.deploy_env(
        context=context,
        changeset=changeset,
    )
    _logger.debug("EKS Environment Stack deployed")
    kubectl.deploy_env(context=context)
    _logger.debug("Kubernetes Environment components deployed")
    helm.deploy_env(context=context)
    _logger.debug("Helm Charts installed")

    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=False)
    ContextSerDe.dump_context_to_ssm(context=context)
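For reference, this remote entry point expects exactly two positional values: the environment name and a flag compared against the literal "skip-images". A hypothetical invocation (the environment name below is illustrative):

# Hypothetical call: "my-env" is a placeholder environment name; any second
# value other than "skip-images" makes deploy_images_remotely build the images.
deploy_env(args=("my-env", "skip-images"))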
Example no. 11
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)

        # EFS Driver
        output_path = _generate_efs_driver_manifest(context=context)
        sh.run(
            f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # FSX Driver
        output_path = _generate_fsx_driver_manifest(context=context)
        sh.run(
            f"kubectl apply -k {output_path} --context {k8s_context} --wait")

        # Orbit Env
        output_path = _generate_env_manifest(context=context)
        sh.run(
            f"kubectl apply -f {output_path} --context {k8s_context} --wait")
        sh.run(
            f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true"
        )
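As a quick sanity check after this variant of deploy_env, the storage drivers can be confirmed from the cluster's registered CSIDriver objects, assuming the kustomizations install the standard AWS EFS and FSx CSI drivers (efs.csi.aws.com and fsx.csi.aws.com); this helper is illustrative and not part of the original code:

def _verify_csi_drivers(k8s_context: str) -> None:
    # Illustrative check: list registered CSI drivers; efs.csi.aws.com and
    # fsx.csi.aws.com should appear once the kustomize manifests above applied.
    sh.run(f"kubectl get csidrivers --context {k8s_context}")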
Example no. 12
def deploy_teams(args: Tuple[str, ...]) -> None:
    _logger.debug("args: %s", args)
    if len(args) == 1:
        env_name: str = args[0]
    else:
        raise ValueError("Unexpected number of values in args")

    context: "Context" = ContextSerDe.load_context_from_ssm(env_name=env_name, type=Context)
    _logger.debug("Context loaded.")
    changeset: Optional["Changeset"] = load_changeset_from_ssm(env_name=env_name)
    _logger.debug("Changeset loaded.")

    if changeset:
        plugins.PLUGINS_REGISTRIES.load_plugins(
            context=context, plugin_changesets=changeset.plugin_changesets, teams_changeset=changeset.teams_changeset
        )
        _logger.debug("Plugins loaded")

    docker.login(context=context)
    _logger.debug("DockerHub and ECR Logged in")
    if changeset and changeset.teams_changeset and changeset.teams_changeset.removed_teams_names:
        kubectl.write_kubeconfig(context=context)
        for team_name in changeset.teams_changeset.removed_teams_names:
            team_context: Optional["TeamContext"] = context.get_team_by_name(name=team_name)
            if team_context is None:
                raise RuntimeError(f"TeamContext {team_name} not found!")
            _logger.debug("Destroying team %s", team_name)
            plugins.PLUGINS_REGISTRIES.destroy_team_plugins(context=context, team_context=team_context)
            _logger.debug("Team Plugins destroyed")
            helm.destroy_team(context=context, team_context=team_context)
            _logger.debug("Team Helm Charts uninstalled")
            kubectl.destroy_team(context=context, team_context=team_context)
            _logger.debug("Kubernetes Team components destroyed")
            eksctl.destroy_team(context=context, team_context=team_context)
            _logger.debug("EKS Team Stack destroyed")
            teams.destroy_team(context=context, team_context=team_context)
            _logger.debug("Team %s destroyed", team_name)
            context.remove_team_by_name(name=team_name)
            ContextSerDe.dump_context_to_ssm(context=context)

    team_names = [t.name for t in context.teams]
    if changeset and changeset.teams_changeset and changeset.teams_changeset.added_teams_names:
        team_names.extend(changeset.teams_changeset.added_teams_names)

    manifest: Optional["Manifest"] = ManifestSerDe.load_manifest_from_ssm(env_name=context.name, type=Manifest)
    if manifest is None:
        raise RuntimeError(f"Manifest {context.name} not found!")
    kubectl.write_kubeconfig(context=context)
    for team_name in team_names:
        team_manifest = manifest.get_team_by_name(name=team_name)
        if team_manifest is None:
            raise RuntimeError(f"TeamManifest {team_name} not found!")
        teams.deploy_team(context=context, manifest=manifest, team_manifest=team_manifest)
        _logger.debug("Team Stacks deployed")
        team_context = context.get_team_by_name(name=team_name)
        if team_context is None:
            raise RuntimeError(f"TeamContext {team_name} not found!")
        eksctl.deploy_team(context=context, team_context=team_context)
        _logger.debug("EKS Team Stack deployed")
        kubectl.deploy_team(context=context, team_context=team_context)
        _logger.debug("Kubernetes Team components deployed")
        helm.deploy_team(context=context, team_context=team_context)
        _logger.debug("Team Helm Charts installed")
        plugins.PLUGINS_REGISTRIES.deploy_team_plugins(
            context=context, team_context=team_context, changes=changeset.plugin_changesets if changeset else []
        )

        team_context.plugins = team_manifest.plugins
        ContextSerDe.dump_context_to_ssm(context=context)
        _logger.debug("Team Plugins deployed")

    k8s_context = utils.get_k8s_context(context=context)
    kubectl.fetch_kubectl_data(context=context, k8s_context=k8s_context, include_teams=True)
    _logger.debug("Teams deployed")
Example no. 13
def deploy_env(context: "Context") -> None:
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if cfn.does_stack_exist(stack_name=eks_stack_name):
        k8s_context = get_k8s_context(context=context)
        _logger.debug("k8s_context: %s", k8s_context)

        # orbit-system kustomizations
        output_paths = _generate_orbit_system_kustomizations(context=context)
        for path in output_paths:
            sh.run(f"kubectl apply -k {path} --context {k8s_context} --wait")

        # Wait until cert-manager webhook is available
        _confirm_endpoints(name="cert-manager-webhook",
                           namespace="cert-manager",
                           k8s_context=k8s_context)
        _confirm_readiness(name="cert-manager",
                           namespace="cert-manager",
                           type="Deployment",
                           k8s_context=k8s_context)
        _confirm_readiness(name="cert-manager-cainjector",
                           namespace="cert-manager",
                           type="Deployment",
                           k8s_context=k8s_context)

        output_path: Optional[str] = _generate_orbit_system_manifest(
            context=context, clean_up=True)
        sh.run(
            f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        output_path = _generate_orbit_image_replicator_manifest(
            context=context, clean_up=True)
        if output_path is not None:
            sh.run(
                f"kubectl apply -f {output_path} --context {k8s_context} --wait"
            )

        # Commented until we confirm this isn't needed
        # Restart orbit-system deployments and statefulsets to force reload of caches etc
        # sh.run(f"kubectl rollout restart deployments -n orbit-system --context {k8s_context}")

        _confirm_readiness(name="podsetting-operator",
                           namespace="orbit-system",
                           type="deployment",
                           k8s_context=k8s_context)
        _confirm_readiness(name="teamspace-operator",
                           namespace="orbit-system",
                           type="deployment",
                           k8s_context=k8s_context)
        _confirm_readiness(name="userspace-operator",
                           namespace="orbit-system",
                           type="deployment",
                           k8s_context=k8s_context)
        _confirm_endpoints(name="podsetting-pod-webhook",
                           namespace="orbit-system",
                           k8s_context=k8s_context)

        if context.install_image_replicator or not context.networking.data.internet_accessible:
            _confirm_readiness(name="imagereplication-operator",
                               namespace="orbit-system",
                               type="deployment",
                               k8s_context=k8s_context)
            _confirm_endpoints(name="imagereplication-pod-webhook",
                               namespace="orbit-system",
                               k8s_context=k8s_context)
            sh.run(
                "kubectl rollout restart daemonsets -n orbit-system-ssm-daemons "
                f"ssm-agent-installer --context {k8s_context}")

        # kube-system kustomizations
        output_paths = _generate_kube_system_kustomizations(context=context)
        for output_path in output_paths:
            sh.run(
                f"kubectl apply -k {output_path} --context {k8s_context} --wait"
            )

        # kube-system manifests
        output_path = _generate_kube_system_manifest(context=context)
        sh.run(
            f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        # Enable ENIs
        _enable_eni(k8s_context=k8s_context)

        # kubeflow-namespaces
        output_path = _kubeflow_namespaces(context=context)
        sh.run(
            f"kubectl apply -f {output_path} --context {k8s_context} --wait")

        kubeflow.deploy_kubeflow(context=context)

        # env
        output_paths = _generate_orbit_system_env_kustomizations(
            context=context)
        for output_path in output_paths:
            sh.run(
                f"kubectl apply -k {output_path} --context {k8s_context} --wait"
            )

        # Patch Kubeflow
        _logger.debug("Orbit applying KubeFlow patch")
        jupyter_launcher_config_map, patch = _generate_kubeflow_patch(
            context=context)
        sh.run(
            f"kubectl apply -f {jupyter_launcher_config_map} --context {k8s_context} --wait"
        )
        sh.run(
            f'kubectl patch deployment -n kubeflow jupyter-web-app-deployment --patch "{patch}"'
        )
        sh.run(
            "kubectl rollout restart deployment jupyter-web-app-deployment -n kubeflow"
        )

        _apply_deployment_patch_force_env_nodes("istio-system")
        _apply_deployment_patch_force_env_nodes("knative-serving")
        _apply_deployment_patch_force_env_nodes("kube-system")
        _apply_deployment_patch_force_env_nodes("kubeflow")

        # Patch Pods to push into Fargate when deploying in an isolated subnet
        if not context.networking.data.internet_accessible:
            patch = (
                '{"spec":{"template":{"metadata":{"labels":{"orbit/node-type":"fargate"}},'
                '"spec":{"nodeSelector": null}}}}')
            sh.run(
                f"kubectl patch deployment -n istio-system authzadaptor --patch '{patch}'"
            )

            patch = (
                '{"spec":{"template":{"metadata":{"labels":{"orbit/node-type":"fargate"}},'
                '"spec":{"nodeSelector": null, "containers":[{"name":"alb-ingress-controller","args":'
                '["--ingress-class=alb","--cluster-name=$(CLUSTER_NAME)","--aws-vpc-id=VPC_ID"]}]}}}}'
            )
            patch = patch.replace("VPC_ID", cast(str,
                                                 context.networking.vpc_id))
            sh.run(
                f"kubectl patch deployment -n kubeflow alb-ingress-controller --patch '{patch}'"
            )

        # Patch the kubeflow mpi-operator deployment to version lock the images to v0.2.3
        patch = (
            '{"spec":{"template":{"spec":{"containers":[{"name":"mpi-operator","args":["-alsologtostderr",'
            '"--lock-namespace","kubeflow","--kubectl-delivery-image","mpioperator/kubectl-delivery:v0.2.3"],'
            '"image":"mpioperator/mpi-operator:v0.2.3"}]}}}}')
        sh.run(
            f"kubectl patch deployment -n kubeflow mpi-operator --patch '{patch}'"
        )

        # Confirm env Service Endpoints
        _confirm_endpoints(name="landing-page-service",
                           namespace="orbit-system",
                           k8s_context=k8s_context)
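Example no. 13 calls several private helpers (_enable_eni, _confirm_readiness, _confirm_endpoints) that are not included in these listings. The sketches below are assumptions based only on how they are called here and on the inline kubectl set env command from Examples no. 8 and 11; the real implementations may poll the Kubernetes API directly:

def _enable_eni(k8s_context: str) -> None:
    # Assumed to wrap the same command Examples no. 8 and 11 run inline.
    sh.run(f"kubectl set env daemonset aws-node -n kube-system --context {k8s_context} ENABLE_POD_ENI=true")


def _confirm_readiness(name: str, namespace: str, type: str, k8s_context: str) -> None:
    # Hypothetical: block until the workload reports a successful rollout.
    sh.run(f"kubectl rollout status {type}/{name} -n {namespace} --context {k8s_context} --timeout=600s")


def _confirm_endpoints(name: str, namespace: str, k8s_context: str) -> None:
    # Hypothetical: print the Service's endpoint addresses; the real helper
    # presumably retries until at least one address is serving.
    sh.run(f"kubectl get endpoints {name} -n {namespace} --context {k8s_context} "
           "-o jsonpath='{.subsets[*].addresses[*].ip}'")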
Example no. 14
def gen_kubeflow_config(context: Context, output_path: str,
                        cluster_name: str) -> None:

    os.makedirs(output_path, exist_ok=True)
    _cleanup_output(output_path=output_path)
    if context.account_id is None:
        raise RuntimeError("context.account_id is None!")
    if context.region is None:
        raise RuntimeError("context.region is None!")

    input = os.path.join(CONFIG_PATH, "kfctl_aws.yaml")
    output = os.path.join(output_path, "kfctl_aws.yaml")

    client = boto3_client(service_name="cognito-idp")
    response: Dict[str, Any] = client.describe_user_pool(
        UserPoolId=context.user_pool_id)
    domain: str = response["UserPool"].get("Domain")

    with open(input, "r") as file:
        content: str = file.read()

    content = utils.resolve_parameters(
        content,
        dict(
            certArn=context.networking.frontend.ssl_cert_arn,
            cognitoAppClientId=context.user_pool_client_id,
            cognitoUserPoolID=context.user_pool_id,
            account_id=context.account_id,
            region=context.region,
            env_name=context.name,
            cluster_name=cluster_name,
            cognitoUserPoolDomain=domain,
        ),
    )
    _logger.debug("Kubeflow configuration:\n%s", content)
    with open(output, "w") as file:
        file.write(content)

    k8s_context = get_k8s_context(context=context)

    input = os.path.join(CONFIG_PATH, "apply_kf.sh")
    output = os.path.join(output_path, "apply_kf.sh")

    with open(input, "r") as file:
        content = file.read()

    content = utils.resolve_parameters(
        content,
        dict(cluster_name=cluster_name, k8s_context=k8s_context),
    )
    _logger.debug("Kubeflow script:\n%s", content)
    with open(output, "w") as file:
        file.write(content)

    sh.run(f"chmod a+x  {output}")

    input = os.path.join(CONFIG_PATH, "delete_kf.sh")
    output = os.path.join(output_path, "delete_kf.sh")

    with open(input, "r") as file:
        content = file.read()

    content = utils.resolve_parameters(
        content,
        dict(cluster_name=cluster_name, k8s_context=k8s_context),
    )
    _logger.debug("Kubeflow script:\n%s", content)
    with open(output, "w") as file:
        file.write(content)

    sh.run(f"chmod a+x  {output}")