Example #1
def guided_install(config, dns_provider, dns_auto_provision, disable_prompt=False):
    # 01 Verify configuration file exists
    verify_configuration_file_exists()

    # 02 Check terraform
    check_terraform()

    # 03 Check Environment Variables
    check_cloud_credentials(config)

    # 04 Check that oauth settings are set
    if not disable_prompt:
        input(
            'Ensure that oauth settings are set in the configuration [Press "Enter" to continue]'
        )

    # 05 Create terraform backend remote state bucket
    with change_directory("terraform-state"):
        run(["terraform", "init"])
        run(["terraform", "apply", "-auto-approve"])

    # 06 Create qhub initial state (up to nginx-ingress)
    with change_directory("infrastructure"):
        run(["terraform", "init"])
        run(
            [
                "terraform",
                "apply",
                "-auto-approve",
                "-target=module.kubernetes",
                "-target=module.kubernetes-initialization",
                "-target=module.kubernetes-ingress",
            ]
        )
        cmd_output = check_output(["terraform", "output", "--json"])
        # This is a bit ugly, but at the moment we are unable to
        # parse cmd_output as JSON on GitHub Actions.
        ip_matches = re.findall(rb'"ip": "(?!string)(.*)"', cmd_output)
        if ip_matches:
            ip = ip_matches[0].decode()
        else:
            raise ValueError(f"IP Address not found in: {cmd_output}")
    # 07 Update DNS to point to qhub deployment
    if dns_auto_provision and dns_provider == "cloudflare":
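        # The last two labels of the domain form the DNS zone;
        # everything before them becomes the record name.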
        record_name, zone_name = (
            config["domain"].split(".")[:-2],
            config["domain"].split(".")[-2:],
        )
        record_name = f'jupyter.{".".join(record_name)}'
        zone_name = ".".join(zone_name)
        if config["provider"] in {"do", "gcp"}:
            update_record(zone_name, record_name, "A", ip)
    else:
        input(
            f'Take IP Address {ip} and update DNS to point to "jupyter.{config["domain"]}" [Press Enter when Complete]'
        )

    # 08 Full deploy QHub
    with change_directory("infrastructure"):
        run(["terraform", "apply", "-auto-approve"])
Example #2
def destroy_configuration(config):
    logger.info(
        """Removing all infrastructure, your local files will still remain,
    you can use 'qhub deploy' to re-install infrastructure using the same config file"""
    )

    with timer(logger, "destroying QHub"):
        # 01 Verify configuration file exists
        verify_configuration_file_exists()

        # 02 Check terraform
        check_terraform()

        # 03 Check Environment Variables
        check_cloud_credentials(config)

        # 04 Remove all infrastructure
        with change_directory("infrastructure"):
            run(["terraform", "destroy", "-auto-approve"])

        # 05 Remove terraform backend remote state bucket
        with change_directory("terraform-state"):
            run(["terraform", "destroy", "-auto-approve"])
Example #3
def guided_install(
    config,
    dns_provider,
    dns_auto_provision,
    disable_prompt=False,
    skip_remote_state_provision=False,
    full_only=False,
):
    # 01 Check Environment Variables
    check_cloud_credentials(config)
    # Check that the secrets required for terraform
    # variables are set
    check_secrets(config)

    # 02 Create terraform backend remote state bucket
    # backwards compatible with `qhub-config.yaml` files
    # that don't have a `terraform_state` key
    if (
        (not skip_remote_state_provision)
        and (config.get("terraform_state", {}).get("type", "") == "remote")
        and (config.get("provider") != "local")
    ):
        terraform_state_sync(config)

    # 03 The kubernetes-alpha provider requires that kubernetes be
    # provisioned before any "kubernetes_manifests" resources
    logger.info("Running terraform init")
    terraform.init(directory="infrastructure")

    if not full_only:
        targets = [
            "module.kubernetes",
            "module.kubernetes-initialization",
        ]

        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        # 04 Create qhub initial state (up to nginx-ingress)
        targets = ["module.kubernetes-ingress"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        cmd_output = terraform.output(directory="infrastructure")
        # This is a bit ugly, but at the moment we are unable to
        # parse cmd_output as JSON on GitHub Actions.
        ip_matches = re.findall(r'"ip": "(?!string)(.+)"', cmd_output)
        hostname_matches = re.findall(r'"hostname": "(?!string)(.+)"', cmd_output)
        if ip_matches:
            ip_or_hostname = ip_matches[0]
        elif hostname_matches:
            ip_or_hostname = hostname_matches[0]
        else:
            raise ValueError(f"IP Address not found in: {cmd_output}")

        # 05 Update DNS to point to qhub deployment
        if dns_auto_provision and dns_provider == "cloudflare":
            record_name, zone_name = (
                config["domain"].split(".")[:-2],
                config["domain"].split(".")[-2:],
            )
            record_name = ".".join(record_name)
            zone_name = ".".join(zone_name)
            if config["provider"] in {"do", "gcp", "azure"}:
                update_record(zone_name, record_name, "A", ip_or_hostname)
                if config.get("clearml", {}).get("enabled"):
                    add_clearml_dns(zone_name, record_name, "A", ip_or_hostname)
            elif config["provider"] == "aws":
                update_record(zone_name, record_name, "CNAME", ip_or_hostname)
                if config.get("clearml", {}).get("enabled"):
                    add_clearml_dns(zone_name, record_name, "CNAME", ip_or_hostname)
            else:
                logger.info(
                    f"Couldn't update the DNS record for cloud provider: {config['provider']}"
                )
        elif not disable_prompt:
            input(
                f"Take IP Address {ip_or_hostname} and update DNS to point to "
                f'"{config["domain"]}" [Press Enter when Complete]'
            )

        # Now Keycloak Helm chart (External Docker Registry before that if we need one)
        targets = ["module.external-container-reg", "module.kubernetes-keycloak-helm"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        # Now Keycloak realm and config
        targets = ["module.kubernetes-keycloak-config"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

    # Full deploy QHub
    logger.info("Running Terraform Stage: FULL")
    terraform.apply(directory="infrastructure")
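
The terraform module used above wraps the Terraform CLI. A minimal sketch of the init/apply/output helpers, assuming they shell out via subprocess (the real implementations may add logging and error handling):

import subprocess

def init(directory):
    # Run `terraform init` inside the given directory.
    subprocess.run(["terraform", "init"], cwd=directory, check=True)

def apply(directory, targets=None):
    # Staged applies pass `-target=...` flags; a full apply passes none.
    command = ["terraform", "apply", "-auto-approve"] + [
        f"-target={target}" for target in (targets or [])
    ]
    subprocess.run(command, cwd=directory, check=True)

def output(directory):
    # Capture `terraform output --json` as text for the regex parsing above.
    return subprocess.check_output(
        ["terraform", "output", "--json"], cwd=directory
    ).decode()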
Example #4
def destroy_configuration(config,
                          skip_remote_state_provision=False,
                          full_only=False):
    logger.info(
        """Removing all infrastructure, your local files will still remain,
    you can use 'qhub deploy' to re-install infrastructure using the same config file\n"""
    )

    with timer(logger, "destroying QHub"):
        # 01 Check Environment Variables
        check_cloud_credentials(config)

        # 02 Remove all infrastructure
        terraform.init(directory="infrastructure")
        terraform.refresh(directory="infrastructure")

        if not full_only:
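            # Destroy in stages, in roughly the reverse order of creation,
            # so dependent resources are removed before what they run on.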
            stages = (
                {
                    "name":
                    "General cluster software",
                    "targets": [
                        "module.kubernetes-nfs-mount",
                        "module.kubernetes-nfs-server",
                        "module.kubernetes-nfs-mount",
                        "module.kubernetes-conda-store-server",
                        "module.kubernetes-conda-store-mount",
                        "module.kubernetes-autoscaling",
                        "module.qhub",
                        "module.prefect",
                        "module.monitoring",
                        "module.clearml",
                        "module.forwardauth",
                        "random_password.jupyterhub-jhsecret",
                        "random_password.forwardauth-jhsecret",
                        "kubernetes_secret.qhub_yaml_secret",
                    ] + [
                        f"module.{helmext['name']}-extension"
                        for helmext in config.get("helm_extensions", [])
                    ] + [
                        f"module.ext-{ext['name']}"
                        for ext in config.get("extensions", [])
                    ],
                },
                {
                    "name":
                    "Keycloak Config",
                    "targets": [
                        "module.kubernetes-keycloak-config",
                        "random_password.keycloak-qhub-bot-password",
                    ],
                },
                {
                    "name": "Keycloak Helm installation",
                    "targets": ["module.kubernetes-keycloak-helm"],
                },
                {
                    "name": "Kubernetes Ingress",
                    "targets": ["module.kubernetes-ingress"],
                },
                {
                    "name":
                    "Kubernetes Cluster",
                    "targets": [
                        "module.kubernetes",
                        "module.kubernetes-initialization",
                    ],
                },
                {
                    "name":
                    "Cloud Infrastructure",
                    "targets": [
                        "module.registry-jupyterhub",  # GCP
                        "module.efs",  # AWS
                        "module.registry-jupyterlab",  # AWS
                        "module.network",  # AWS
                        "module.accounting",  # AWS
                        "module.registry",  # Azure
                    ],
                },
            )

            for stageinfo in stages:
                logger.info(
                    f"Running Terraform Stage: {stageinfo['name']} {stageinfo['targets']}"
                )
                terraform.destroy(directory="infrastructure",
                                  targets=stageinfo["targets"])

        else:
            logger.info("Running Terraform Stage: FULL")
            terraform.destroy(directory="infrastructure")

        # 03 Remove terraform backend remote state bucket
        # backwards compatible with `qhub-config.yaml` files
        # that don't have a `terraform_state` key
        if ((not skip_remote_state_provision) and
            (config.get("terraform_state", {}).get("type", "") == "remote")
                and (config.get("provider") != "local")):
            terraform_state_sync(config)
            terraform.destroy(directory="terraform-state")
Example #5
def force_destroy_configuration(config):
    logging.info(
        """FORCE Removing all infrastructure (not using terraform).""")

    with timer(logging, "destroying QHub"):
        # 01 Check we have cloud details we need
        check_cloud_credentials(config)

        if config.get("provider", "") != "aws":
            raise ValueError("force-destroy currently only available for AWS")

        project_name = config.get("project_name", "").strip()

        if project_name == "":
            raise ValueError("project_name cannot be blank")

        if "amazon_web_services" not in config:
            raise ValueError(
                "amazon_web_services section must exist in qhub-config.yaml")

        region = config["amazon_web_services"].get("region", "").strip()

        if region == "":
            raise ValueError(
                "amazon_web_services.region must exist in qhub-config.yaml")

        logging.info(f"Remove AWS project {project_name} in region {region}")

        env = config.get("namespace", "dev").strip()

        # 02 Remove all infrastructure
        try:
            import boto3
        except ImportError:
            raise ValueError(
                "Please ensure the boto3 package is installed using: pip install boto3==1.17.98"
            )

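        # The Resource Groups Tagging API enumerates every resource that
        # terraform tagged with this project's Owner/Environment/Project tags.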
        restag = boto3.client("resourcegroupstaggingapi", region_name=region)

        filter_params = dict(
            TagFilters=[
                {
                    "Key": "Owner",
                    "Values": [
                        "terraform",
                        "terraform-state",
                    ],
                },
                {
                    "Key": "Environment",
                    "Values": [
                        env,
                    ],
                },
                {
                    "Key": "Project",
                    "Values": [
                        project_name,
                    ],
                },
            ],
            ResourcesPerPage=50,
        )

        resources = []

        response = restag.get_resources(**filter_params)

        resources.extend(response["ResourceTagMappingList"])

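        # Page through the remaining results; get_resources returns a
        # PaginationToken until the listing is exhausted.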
        while "PaginationToken" in response and response["PaginationToken"]:
            token = response["PaginationToken"]
            response = restag.get_resources(**filter_params,
                                            PaginationToken=token)
            resources.extend(response["ResourceTagMappingList"])

        # Load Balancer and other K8s-generated resources will need to be queried separately:

        filter_params = dict(
            TagFilters=[{
                "Key": f"kubernetes.io/cluster/{project_name}-{env}",
                "Values": [
                    "owned",
                ],
            }],
            ResourcesPerPage=50,
        )

        response = restag.get_resources(**filter_params)
        resources.extend(response["ResourceTagMappingList"])

        # IAM roles are global, so look up the expected role names directly
        # and verify their tags instead of using the regional tagging API.

        iam = boto3.resource("iam")
        for suffix in ("eks-cluster-role", "eks-node-group-role"):

            try:
                role = iam.Role(f"{project_name}-{env}-{suffix}")

                if role.tags is not None:

                    tags_dict = dict([(t["Key"], t.get("Value", ""))
                                      for t in role.tags])

                    if (tags_dict.get("Owner", "") == "terraform"
                            and tags_dict.get("Environment", "") == env
                            and tags_dict.get("Project", "") == project_name):
                        resources.append({"ResourceARN": role.arn})

            except iam.meta.client.exceptions.NoSuchEntityException:
                pass

        # Summarize resources

        type_groups = {}
        for r in resources:
            de_arned = parse_arn(r["ResourceARN"])
            t = f"{de_arned['service']}-{de_arned['resource_type']}"
            type_groups.setdefault(t, []).append(de_arned)
            logging.info(r["ResourceARN"])

        logging.info([(k, len(v)) for k, v in type_groups.items()])

        # Delete in dependency order: node groups before their cluster,
        # subnets, route tables, and gateways before the VPC, and so on.
        priority_types = (
            "eks-nodegroup",
            "eks-cluster",
            "elasticloadbalancing-loadbalancer",
            "ec2-internet-gateway",
            "ec2-route-table",
            "elasticfilesystem-file-system",
            "ec2-subnet",
            "ec2-security-group",
            "ec2-vpc",
            "ecr-repository",
            "dynamodb-table",
            "s3-None",
            "resource-groups-group",
            "iam-role",
        )

        for pt in priority_types:
            logging.info(f"Inspect {pt}")
            for r in type_groups.get(pt, []):
                if pt == "eks-nodegroup":
                    nodegroup_resource = r["resource"].split("/")

                    cluster_name = nodegroup_resource[0]
                    nodegroup_name = nodegroup_resource[1]

                    logging.info(
                        f"Delete {nodegroup_name} on cluster {cluster_name}")

                    client = boto3.client("eks", region_name=region)
                    client.delete_nodegroup(clusterName=cluster_name,
                                            nodegroupName=nodegroup_name)

                elif pt == "eks-cluster":
                    logging.info(f"Delete EKS cluster {r['resource']}")

                    client = boto3.client("eks", region_name=region)

                    response = client.list_nodegroups(
                        clusterName=r["resource"])
                    while len(response["nodegroups"]) > 0:
                        logging.info("Nodegroups still present, sleep 10")
                        time.sleep(10)
                        response = client.list_nodegroups(
                            clusterName=r["resource"])

                    client.delete_cluster(name=r["resource"])

                elif pt == "elasticloadbalancing-loadbalancer":
                    client = boto3.client("elb", region_name=region)

                    logging.info(f"Inspect Load balancer {r['resource']}")

                    logging.info(f"Delete Load balancer {r['resource']}")
                    response = client.delete_load_balancer(
                        LoadBalancerName=r["resource"])

                elif pt == "ec2-route-table":
                    logging.info(f"Inspect route table {r['resource']}")
                    ec2 = boto3.resource("ec2", region_name=region)
                    route_table = ec2.RouteTable(r["resource"])

                    for assoc in route_table.associations:
                        logging.info(f"Delete route table assoc {assoc.id}")
                        assoc.delete()

                    time.sleep(10)

                    logging.info(f"Delete route table {r['resource']}")
                    route_table.delete()

                elif pt == "ec2-subnet":
                    logging.info(f"Inspect subnet {r['resource']}")
                    ec2 = boto3.resource("ec2", region_name=region)
                    subnet = ec2.Subnet(r["resource"])

                    for ni in subnet.network_interfaces.all():
                        ni.load()
                        # But can only detach if attached...
                        if ni.attachment:
                            ni.detach(DryRun=False, Force=True)
                            ni.delete()

                    logging.info(f"Delete subnet {r['resource']}")
                    subnet.delete(DryRun=False)

                elif pt == "ec2-security-group":
                    logging.info(f"Inspect security group {r['resource']}")
                    ec2 = boto3.resource("ec2", region_name=region)
                    security_group = ec2.SecurityGroup(r["resource"])

                    for ipperms in security_group.ip_permissions_egress:
                        security_group.revoke_egress(DryRun=False,
                                                     IpPermissions=[ipperms])

                    for ipperms in security_group.ip_permissions:
                        security_group.revoke_ingress(DryRun=False,
                                                      IpPermissions=[ipperms])

                    logging.info(f"Delete security group {r['resource']}")
                    security_group.delete(DryRun=False)

                elif pt == "ec2-internet-gateway":
                    logging.info(f"Inspect internet gateway {r['resource']}")

                    ec2 = boto3.resource("ec2", region_name=region)
                    internet_gateway = ec2.InternetGateway(r["resource"])

                    for attach in internet_gateway.attachments:
                        logging.info(
                            f"Inspect IG attachment {attach['VpcId']}")
                        if attach.get("State", "") == "available":
                            logging.info(f"Detach from VPC {attach['VpcId']}")
                            internet_gateway.detach_from_vpc(
                                VpcId=attach["VpcId"])

                    time.sleep(10)

                    logging.info(f"Delete internet gateway {r['resource']}")
                    internet_gateway.delete(DryRun=False)

                elif pt == "elasticfilesystem-file-system":
                    client = boto3.client("efs", region_name=region)

                    logging.info(f"Delete efs {r['resource']}")

                    mts = client.describe_mount_targets(
                        FileSystemId=r["resource"])

                    for mt in mts["MountTargets"]:
                        client.delete_mount_target(
                            MountTargetId=mt["MountTargetId"])

                    response = client.delete_file_system(
                        FileSystemId=r["resource"])

                    ## Should wait until this returns botocore.errorfactory.FileSystemNotFound:
                    # response = client.describe_file_systems(
                    #    FileSystemId=r['resource']
                    # )

                elif pt == "ec2-vpc":
                    logging.info(f"Inspect VPC {r['resource']}")

                    ec2 = boto3.resource("ec2", region_name=region)

                    vpc = ec2.Vpc(r["resource"])

                    # for cidr_assoc in vpc.cidr_block_association_set:
                    #    logging.info(cidr_assoc)
                    #    r = vpc.disassociate_subnet_cidr_block(
                    #        AssociationId=cidr_assoc['AssociationId']
                    #    )
                    #    logging.info(r)

                    logging.info(f"Delete VPC {r['resource']}")
                    vpc.delete()

                elif pt == "ecr-repository":
                    logging.info(f"Inspect ECR {r['resource']}")
                    client = boto3.client("ecr", region_name=region)

                    logging.info(
                        f"Delete ecr {r['account']} / {r['resource']}")

                    response = client.delete_repository(
                        registryId=r["account"],
                        repositoryName=r["resource"],
                        force=True,
                    )

                elif pt == "s3-None":
                    logging.info(f"Inspect S3 {r['resource']}")
                    s3 = boto3.resource("s3", region_name=region)

                    logging.info(f"Delete s3 {r['resource']}")

                    bucket = s3.Bucket(r["resource"])

                    r = bucket.objects.all().delete()

                    r = bucket.object_versions.delete()

                    response = bucket.delete()

                elif pt == "dynamodb-table":
                    logging.info(f"Inspect DynamoDB {r['resource']}")

                    client = boto3.client("dynamodb", region_name=region)

                    logging.info(f"Delete DynamoDB {r['resource']}")

                    response = client.delete_table(TableName=r["resource"])

                elif pt == "resource-groups-group":
                    logging.info(f"Inspect Resource Group {r['resource']}")

                    client = boto3.client("resource-groups",
                                          region_name=region)

                    logging.info(f"Delete Resource Group {r['resource']}")

                    response = client.delete_group(Group=r["arn"])

                elif pt == "iam-role":
                    logging.info(f"Inspect IAM Role {r['resource']}")
                    iam = boto3.resource("iam")
                    role = iam.Role(r["resource"])

                    for policy in role.attached_policies.all():
                        logging.info(f"Detach Role policy {policy.arn}")
                        response = role.detach_policy(PolicyArn=policy.arn)

                    logging.info(f"Delete IAM Role {r['resource']}")
                    role.delete()
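
The parse_arn helper used above is also not shown. A minimal sketch, assuming it splits a standard ARN (arn:partition:service:region:account:resource-type/resource) into named fields; S3 bucket ARNs carry no resource type, which is why "s3-None" appears in priority_types:

def parse_arn(arn):
    # Hypothetical implementation: break an ARN into its colon-delimited
    # fields; the trailing resource may be "type/name", "type:name", or bare.
    parts = arn.split(":", 5)
    resource = parts[5]
    resource_type = None
    if "/" in resource:
        resource_type, resource = resource.split("/", 1)
    elif ":" in resource:
        resource_type, resource = resource.split(":", 1)
    return {
        "arn": arn,
        "partition": parts[1],
        "service": parts[2],
        "region": parts[3],
        "account": parts[4],
        "resource_type": resource_type,
        "resource": resource,
    }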
Example #6
def github_auto_provision(config, owner, repo):
    check_cloud_credentials(
        config
    )  # We may need env vars such as AWS_ACCESS_KEY_ID depending on provider

    already_exists = True
    try:
        github.get_repository(owner, repo)
    except requests.exceptions.HTTPError:
        # repo not found
        already_exists = False

    if not already_exists:
        try:
            github.create_repository(
                owner,
                repo,
                description=f'QHub {config["project_name"]}-{config["provider"]}',
                homepage=f'https://{config["domain"]}',
            )
        except requests.exceptions.HTTPError as he:
            raise ValueError(
                f"Unable to create GitHub repo https://github.com/{owner}/{repo} - error message from GitHub is: {he}"
            )
    else:
        logger.warning(
            f"GitHub repo https://github.com/{owner}/{repo} already exists")

    try:
        # Secrets
        if config["provider"] == "do":
            for name in {
                    "AWS_ACCESS_KEY_ID",
                    "AWS_SECRET_ACCESS_KEY",
                    "SPACES_ACCESS_KEY_ID",
                    "SPACES_SECRET_ACCESS_KEY",
                    "DIGITALOCEAN_TOKEN",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        elif config["provider"] == "aws":
            for name in {
                    "AWS_ACCESS_KEY_ID",
                    "AWS_SECRET_ACCESS_KEY",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        elif config["provider"] == "gcp":
            github.update_secret(owner, repo, "PROJECT_ID",
                                 os.environ["PROJECT_ID"])
            with open(os.environ["GOOGLE_CREDENTIALS"]) as f:
                github.update_secret(owner, repo, "GOOGLE_CREDENTIALS",
                                     f.read())
        elif config["provider"] == "azure":
            for name in {
                    "ARM_CLIENT_ID",
                    "ARM_CLIENT_SECRET",
                    "ARM_SUBSCRIPTION_ID",
                    "ARM_TENANT_ID",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        github.update_secret(owner, repo, "REPOSITORY_ACCESS_TOKEN",
                             os.environ["GITHUB_TOKEN"])
    except requests.exceptions.HTTPError as he:
        raise ValueError(
            f"Unable to set Secrets on GitHub repo https://github.com/{owner}/{repo} - error message from GitHub is: {he}"
        )

    return f"[email protected]:{owner}/{repo}.git"