def guided_install(config, dns_provider, dns_auto_provision, disable_prompt=False):
    # 01 Verify configuration file exists
    verify_configuration_file_exists()

    # 02 Check terraform
    check_terraform()

    # 03 Check Environment Variables
    check_cloud_credentials(config)

    # 04 Check that oauth settings are set
    if not disable_prompt:
        input(
            'Ensure that oauth settings are in configuration [Press "Enter" to continue]'
        )

    # 05 Create terraform backend remote state bucket
    with change_directory("terraform-state"):
        run(["terraform", "init"])
        run(["terraform", "apply", "-auto-approve"])

    # 06 Create qhub initial state (up to nginx-ingress)
    with change_directory("infrastructure"):
        run(["terraform", "init"])
        run(
            [
                "terraform",
                "apply",
                "-auto-approve",
                "-target=module.kubernetes",
                "-target=module.kubernetes-initialization",
                "-target=module.kubernetes-ingress",
            ]
        )
        cmd_output = check_output(["terraform", "output", "--json"])
        # This is a bit ugly, but the issue we have at the moment is being unable
        # to parse cmd_output as json on Github Actions.
        ip_matches = re.findall(rb'"ip": "(?!string)(.*)"', cmd_output)
        if ip_matches:
            ip = ip_matches[0].decode()
        else:
            raise ValueError(f"IP Address not found in: {cmd_output}")

    # 07 Update DNS to point to qhub deployment
    if dns_auto_provision and dns_provider == "cloudflare":
        record_name, zone_name = (
            config["domain"].split(".")[:-2],
            config["domain"].split(".")[-2:],
        )
        record_name = f'jupyter.{".".join(record_name)}'
        zone_name = ".".join(zone_name)
        if config["provider"] in {"do", "gcp"}:
            update_record(zone_name, record_name, "A", ip)
    else:
        input(
            f"Take IP Address {ip} and update DNS to point to "
            f'"jupyter.{config["domain"]}" [Press Enter when Complete]'
        )

    # 08 Full deploy QHub
    with change_directory("infrastructure"):
        run(["terraform", "apply", "-auto-approve"])
def destroy_configuration(config):
    logger.info(
        "Removing all infrastructure; your local files will still remain.\n"
        "You can use 'qhub deploy' to re-install infrastructure using the same config file."
    )

    with timer(logger, "destroying QHub"):
        # 01 Verify configuration file exists
        verify_configuration_file_exists()

        # 02 Check terraform
        check_terraform()

        # 03 Check Environment Variables
        check_cloud_credentials(config)

        # 04 Remove all infrastructure
        with change_directory("infrastructure"):
            run(["terraform", "destroy", "-auto-approve"])

        # 05 Remove terraform backend remote state bucket
        with change_directory("terraform-state"):
            run(["terraform", "destroy", "-auto-approve"])
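# The two functions above shell out to terraform via a `change_directory` helper
# defined elsewhere in the package. Below is a minimal, purely illustrative sketch
# (named `_change_directory_sketch` so it does not shadow the real helper) of what
# such a context manager could look like, assuming it simply swaps the working
# directory around the wrapped `run(...)` calls; the packaged implementation may differ.
import contextlib
import os


@contextlib.contextmanager
def _change_directory_sketch(directory):
    """Temporarily change the working directory (illustrative sketch only)."""
    previous_directory = os.getcwd()
    os.chdir(directory)
    try:
        yield
    finally:
        # Always restore the original directory, even if the wrapped block raises.
        os.chdir(previous_directory)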
def guided_install(
    config,
    dns_provider,
    dns_auto_provision,
    disable_prompt=False,
    skip_remote_state_provision=False,
    full_only=False,
):
    # 01 Check Environment Variables
    check_cloud_credentials(config)

    # Check that secrets required for terraform
    # variables are set as required
    check_secrets(config)

    # 02 Create terraform backend remote state bucket
    # backwards compatible with `qhub-config.yaml` files that
    # don't have a `terraform_state` key
    if (
        (not skip_remote_state_provision)
        and (config.get("terraform_state", {}).get("type", "") == "remote")
        and (config.get("provider") != "local")
    ):
        terraform_state_sync(config)

    # 03 The kubernetes-alpha provider requires that kubernetes be
    # provisioned before any "kubernetes_manifests" resources
    logger.info("Running terraform init")
    terraform.init(directory="infrastructure")

    if not full_only:
        targets = [
            "module.kubernetes",
            "module.kubernetes-initialization",
        ]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        # 04 Create qhub initial state (up to nginx-ingress)
        targets = ["module.kubernetes-ingress"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        cmd_output = terraform.output(directory="infrastructure")
        # This is a bit ugly, but the issue we have at the moment is being unable
        # to parse cmd_output as json on Github Actions.
        ip_matches = re.findall(r'"ip": "(?!string)(.+)"', cmd_output)
        hostname_matches = re.findall(r'"hostname": "(?!string)(.+)"', cmd_output)
        if ip_matches:
            ip_or_hostname = ip_matches[0]
        elif hostname_matches:
            ip_or_hostname = hostname_matches[0]
        else:
            raise ValueError(f"IP Address not found in: {cmd_output}")

        # 05 Update DNS to point to qhub deployment
        if dns_auto_provision and dns_provider == "cloudflare":
            record_name, zone_name = (
                config["domain"].split(".")[:-2],
                config["domain"].split(".")[-2:],
            )
            record_name = ".".join(record_name)
            zone_name = ".".join(zone_name)
            if config["provider"] in {"do", "gcp", "azure"}:
                update_record(zone_name, record_name, "A", ip_or_hostname)
                if config.get("clearml", {}).get("enabled"):
                    add_clearml_dns(zone_name, record_name, "A", ip_or_hostname)
            elif config["provider"] == "aws":
                update_record(zone_name, record_name, "CNAME", ip_or_hostname)
                if config.get("clearml", {}).get("enabled"):
                    add_clearml_dns(zone_name, record_name, "CNAME", ip_or_hostname)
            else:
                logger.info(
                    f"Couldn't update the DNS record for cloud provider: {config['provider']}"
                )
        elif not disable_prompt:
            input(
                f"Take IP Address {ip_or_hostname} and update DNS to point to "
                f'"{config["domain"]}" [Press Enter when Complete]'
            )

        # Now Keycloak Helm chart (External Docker Registry before that if we need one)
        targets = ["module.external-container-reg", "module.kubernetes-keycloak-helm"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

        # Now Keycloak realm and config
        targets = ["module.kubernetes-keycloak-config"]
        logger.info(f"Running Terraform Stage: {targets}")
        terraform.apply(
            directory="infrastructure",
            targets=targets,
        )

    # Full deploy QHub
    logger.info("Running Terraform Stage: FULL")
    terraform.apply(directory="infrastructure")
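# To make the regex-based extraction in `guided_install` above concrete, here is a
# small, purely illustrative example. The sample text is hypothetical (the real
# `terraform output` format depends on the Terraform version); it only demonstrates
# how the `(?!string)` negative lookahead skips the `"string"` type annotations and
# captures the actual value. It assumes the module-level `re` import used above.
def _example_ip_extraction():
    """Illustrative only; not used by the deployment code above."""
    sample_output = '"ip": "string",\n"ip": "203.0.113.10",'
    # Returns ['203.0.113.10'] -- the '"ip": "string"' line is skipped.
    return re.findall(r'"ip": "(?!string)(.+)"', sample_output)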
def destroy_configuration(config, skip_remote_state_provision=False, full_only=False):
    logger.info(
        "Removing all infrastructure; your local files will still remain.\n"
        "You can use 'qhub deploy' to re-install infrastructure using the same config file."
    )

    with timer(logger, "destroying QHub"):
        # 01 Check Environment Variables
        check_cloud_credentials(config)

        # 02 Remove all infrastructure
        terraform.init(directory="infrastructure")
        terraform.refresh(directory="infrastructure")

        if not full_only:
            stages = (
                {
                    "name": "General cluster software",
                    "targets": [
                        "module.kubernetes-nfs-mount",
                        "module.kubernetes-nfs-server",
                        "module.kubernetes-conda-store-server",
                        "module.kubernetes-conda-store-mount",
                        "module.kubernetes-autoscaling",
                        "module.qhub",
                        "module.prefect",
                        "module.monitoring",
                        "module.clearml",
                        "module.forwardauth",
                        "random_password.jupyterhub-jhsecret",
                        "random_password.forwardauth-jhsecret",
                        "kubernetes_secret.qhub_yaml_secret",
                    ]
                    + [
                        f"module.{helmext['name']}-extension"
                        for helmext in config.get("helm_extensions", [])
                    ]
                    + [
                        f"module.ext-{ext['name']}"
                        for ext in config.get("extensions", [])
                    ],
                },
                {
                    "name": "Keycloak Config",
                    "targets": [
                        "module.kubernetes-keycloak-config",
                        "random_password.keycloak-qhub-bot-password",
                    ],
                },
                {
                    "name": "Keycloak Helm installation",
                    "targets": ["module.kubernetes-keycloak-helm"],
                },
                {
                    "name": "Kubernetes Ingress",
                    "targets": ["module.kubernetes-ingress"],
                },
                {
                    "name": "Kubernetes Cluster",
                    "targets": [
                        "module.kubernetes",
                        "module.kubernetes-initialization",
                    ],
                },
                {
                    "name": "Cloud Infrastructure",
                    "targets": [
                        "module.registry-jupyterhub",  # GCP
                        "module.efs",  # AWS
                        "module.registry-jupyterlab",  # AWS
                        "module.network",  # AWS
                        "module.accounting",  # AWS
                        "module.registry",  # Azure
                    ],
                },
            )

            for stageinfo in stages:
                logger.info(
                    f"Running Terraform Stage: {stageinfo['name']} {stageinfo['targets']}"
                )
                terraform.destroy(
                    directory="infrastructure", targets=stageinfo["targets"]
                )
        else:
            logger.info("Running Terraform Stage: FULL")
            terraform.destroy(directory="infrastructure")

        # 03 Remove terraform backend remote state bucket
        # backwards compatible with `qhub-config.yaml` files that
        # don't have a `terraform_state` key
        if (
            (not skip_remote_state_provision)
            and (config.get("terraform_state", {}).get("type", "") == "remote")
            and (config.get("provider") != "local")
        ):
            terraform_state_sync(config)
            terraform.destroy(directory="terraform-state")
def force_destroy_configuration(config):
    logging.info("FORCE Removing all infrastructure (not using terraform).")

    with timer(logging, "destroying QHub"):
        # 01 Check we have cloud details we need
        check_cloud_credentials(config)

        if config.get("provider", "") != "aws":
            raise ValueError("force-destroy currently only available for AWS")

        project_name = config.get("project_name", "").strip()
        if project_name == "":
            raise ValueError("project_name cannot be blank")

        if "amazon_web_services" not in config:
            raise ValueError(
                "amazon_web_services section must exist in qhub-config.yaml"
            )

        region = config["amazon_web_services"].get("region", "").strip()
        if region == "":
            raise ValueError(
                "amazon_web_services.region must exist in qhub-config.yaml"
            )

        logging.info(f"Remove AWS project {project_name} in region {region}")

        env = config.get("namespace", "dev").strip()

        # 02 Remove all infrastructure
        try:
            import boto3
        except ImportError:
            raise ValueError(
                "Please ensure boto3 package is installed using: pip install boto3==1.17.98"
            )

        restag = boto3.client("resourcegroupstaggingapi", region_name=region)

        filter_params = dict(
            TagFilters=[
                {
                    "Key": "Owner",
                    "Values": [
                        "terraform",
                        "terraform-state",
                    ],
                },
                {
                    "Key": "Environment",
                    "Values": [
                        env,
                    ],
                },
                {
                    "Key": "Project",
                    "Values": [
                        project_name,
                    ],
                },
            ],
            ResourcesPerPage=50,
        )

        resources = []
        response = restag.get_resources(**filter_params)
        resources.extend(response["ResourceTagMappingList"])

        while "PaginationToken" in response and response["PaginationToken"]:
            token = response["PaginationToken"]
            response = restag.get_resources(**filter_params, PaginationToken=token)
            resources.extend(response["ResourceTagMappingList"])

        # Load Balancer and other K8s-generated resources will need to be queried separately:
        filter_params = dict(
            TagFilters=[
                {
                    "Key": f"kubernetes.io/cluster/{project_name}-{env}",
                    "Values": [
                        "owned",
                    ],
                }
            ],
            ResourcesPerPage=50,
        )

        response = restag.get_resources(**filter_params)
        resources.extend(response["ResourceTagMappingList"])

        # IAM
        iam = boto3.resource("iam")
        for suffix in ("eks-cluster-role", "eks-node-group-role"):
            try:
                role = iam.Role(f"{project_name}-{env}-{suffix}")
                if role.tags is not None:
                    tags_dict = dict(
                        [(t["Key"], t.get("Value", "")) for t in role.tags]
                    )
                    if (
                        tags_dict.get("Owner", "") == "terraform"
                        and tags_dict.get("Environment", "") == env
                        and tags_dict.get("Project", "") == project_name
                    ):
                        resources.append({"ResourceARN": role.arn})
            except iam.meta.client.exceptions.NoSuchEntityException:
                pass

        # Summarize resources
        type_groups = {}
        for r in resources:
            de_arned = parse_arn(r["ResourceARN"])
            t = f"{de_arned['service']}-{de_arned['resource_type']}"
            type_groups.setdefault(t, []).append(de_arned)
            logging.info(r["ResourceARN"])

        logging.info([(k, len(v)) for k, v in type_groups.items()])

        # Order
        priority_types = (
            "eks-nodegroup",
            "eks-cluster",
            "elasticloadbalancing-loadbalancer",
            "ec2-internet-gateway",
            "ec2-route-table",
            "elasticfilesystem-file-system",
            "ec2-subnet",
            "ec2-security-group",
            "ec2-vpc",
            "ecr-repository",
            "dynamodb-table",
            "s3-None",
            "resource-groups-group",
            "iam-role",
        )

        for pt in priority_types:
            logging.info(f"Inspect {pt}")

            for r in type_groups.get(pt, []):

                if pt == "eks-nodegroup":
                    nodegroup_resource = r["resource"].split("/")
                    cluster_name = nodegroup_resource[0]
                    nodegroup_name = nodegroup_resource[1]
                    logging.info(f"Delete {nodegroup_name} on cluster {cluster_name}")
                    client = boto3.client("eks", region_name=region)
                    client.delete_nodegroup(
                        clusterName=cluster_name, nodegroupName=nodegroup_name
                    )

                elif pt == "eks-cluster":
logging.info(f"Delete EKS cluster {r['resource']}") client = boto3.client("eks", region_name=region) response = client.list_nodegroups( clusterName=r["resource"]) while len(response["nodegroups"]) > 0: logging.info("Nodegroups still present, sleep 10") time.sleep(10) response = client.list_nodegroups( clusterName=r["resource"]) client.delete_cluster(name=r["resource"]) elif pt == "elasticloadbalancing-loadbalancer": client = boto3.client("elb", region_name=region) logging.info(f"Inspect Load balancer {r['resource']}") logging.info(f"Delete Load balancer {r['resource']}") response = client.delete_load_balancer( LoadBalancerName=r["resource"]) elif pt == "ec2-route-table": logging.info(f"Inspect route table {r['resource']}") ec2 = boto3.resource("ec2", region_name=region) route_table = ec2.RouteTable(r["resource"]) for assoc in route_table.associations: logging.info(f"Delete route table assoc {assoc.id}") assoc.delete() time.sleep(10) logging.info(f"Delete route table {r['resource']}") route_table.delete() elif pt == "ec2-subnet": logging.info(f"Inspect subnet {r['resource']}") ec2 = boto3.resource("ec2", region_name=region) subnet = ec2.Subnet(r["resource"]) for ni in subnet.network_interfaces.all(): ni.load() # But can only detach if attached... if ni.attachment: ni.detach(DryRun=False, Force=True) ni.delete() logging.info(f"Delete subnet {r['resource']}") subnet.delete(DryRun=False) elif pt == "ec2-security-group": logging.info(f"Inspect security group {r['resource']}") ec2 = boto3.resource("ec2", region_name=region) security_group = ec2.SecurityGroup(r["resource"]) for ipperms in security_group.ip_permissions_egress: security_group.revoke_egress(DryRun=False, IpPermissions=[ipperms]) for ipperms in security_group.ip_permissions: security_group.revoke_ingress(DryRun=False, IpPermissions=[ipperms]) logging.info(f"Delete security group {r['resource']}") security_group.delete(DryRun=False) elif pt == "ec2-internet-gateway": logging.info(f"Inspect internet gateway {r['resource']}") ec2 = boto3.resource("ec2", region_name=region) internet_gateway = ec2.InternetGateway(r["resource"]) for attach in internet_gateway.attachments: logging.info( f"Inspect IG attachment {attach['VpcId']}") if attach.get("State", "") == "available": logging.info(f"Detach from VPC {attach['VpcId']}") internet_gateway.detach_from_vpc( VpcId=attach["VpcId"]) time.sleep(10) logging.info(f"Delete internet gateway {r['resource']}") internet_gateway.delete(DryRun=False) elif pt == "elasticfilesystem-file-system": client = boto3.client("efs", region_name=region) logging.info(f"Delete efs {r['resource']}") mts = client.describe_mount_targets( FileSystemId=r["resource"]) for mt in mts["MountTargets"]: client.delete_mount_target( MountTargetId=mt["MountTargetId"]) response = client.delete_file_system( FileSystemId=r["resource"]) ## Should wait until this returns botocore.errorfactory.FileSystemNotFound: # response = client.describe_file_systems( # FileSystemId=r['resource'] # ) elif pt == "ec2-vpc": logging.info(f"Inspect VPC {r['resource']}") ec2 = boto3.resource("ec2", region_name=region) vpc = ec2.Vpc(r["resource"]) # for cidr_assoc in vpc.cidr_block_association_set: # logging.info(cidr_assoc) # r = vpc.disassociate_subnet_cidr_block( # AssociationId=cidr_assoc['AssociationId'] # ) # logging.info(r) logging.info(f"Delete VPC {r['resource']}") vpc.delete() elif pt == "ecr-repository": logging.info(f"Inspect ECR {r['resource']}") client = boto3.client("ecr", region_name=region) logging.info( f"Delete ecr {r['account']} / 
{r['resource']}") response = response = client.delete_repository( registryId=r["account"], repositoryName=r["resource"], force=True, ) elif pt == "s3-None": logging.info(f"Inspect S3 {r['resource']}") s3 = boto3.resource("s3", region_name=region) logging.info(f"Delete s3 {r['resource']}") bucket = s3.Bucket(r["resource"]) r = bucket.objects.all().delete() r = bucket.object_versions.delete() response = bucket.delete() elif pt == "dynamodb-table": logging.info(f"Inspect DynamoDB {r['resource']}") client = boto3.client("dynamodb", region_name=region) logging.info(f"Delete DynamoDB {r['resource']}") response = client.delete_table(TableName=r["resource"]) elif pt == "resource-groups-group": logging.info(f"Inspect Resource Group {r['resource']}") client = boto3.client("resource-groups", region_name=region) logging.info(f"Delete Resource Group {r['resource']}") response = client.delete_group(Group=r["arn"]) elif pt == "iam-role": logging.info(f"Inspect IAM Role {r['resource']}") iam = boto3.resource("iam") role = iam.Role(r["resource"]) for policy in role.attached_policies.all(): logging.info(f"Detach Role policy {policy.arn}") response = role.detach_policy(PolicyArn=policy.arn) logging.info(f"Delete IAM Role {r['resource']}") role.delete()
def github_auto_provision(config, owner, repo):
    check_cloud_credentials(
        config
    )  # We may need env vars such as AWS_ACCESS_KEY_ID depending on provider

    already_exists = True
    try:
        github.get_repository(owner, repo)
    except requests.exceptions.HTTPError:
        # repo not found
        already_exists = False

    if not already_exists:
        try:
            github.create_repository(
                owner,
                repo,
                description=f'QHub {config["project_name"]}-{config["provider"]}',
                homepage=f'https://{config["domain"]}',
            )
        except requests.exceptions.HTTPError as he:
            raise ValueError(
                f"Unable to create GitHub repo https://github.com/{owner}/{repo} "
                f"- error message from GitHub is: {he}"
            )
    else:
        logger.warning(f"GitHub repo https://github.com/{owner}/{repo} already exists")

    try:
        # Secrets
        if config["provider"] == "do":
            for name in {
                "AWS_ACCESS_KEY_ID",
                "AWS_SECRET_ACCESS_KEY",
                "SPACES_ACCESS_KEY_ID",
                "SPACES_SECRET_ACCESS_KEY",
                "DIGITALOCEAN_TOKEN",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        elif config["provider"] == "aws":
            for name in {
                "AWS_ACCESS_KEY_ID",
                "AWS_SECRET_ACCESS_KEY",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        elif config["provider"] == "gcp":
            github.update_secret(owner, repo, "PROJECT_ID", os.environ["PROJECT_ID"])
            with open(os.environ["GOOGLE_CREDENTIALS"]) as f:
                github.update_secret(owner, repo, "GOOGLE_CREDENTIALS", f.read())
        elif config["provider"] == "azure":
            for name in {
                "ARM_CLIENT_ID",
                "ARM_CLIENT_SECRET",
                "ARM_SUBSCRIPTION_ID",
                "ARM_TENANT_ID",
            }:
                github.update_secret(owner, repo, name, os.environ[name])
        github.update_secret(
            owner, repo, "REPOSITORY_ACCESS_TOKEN", os.environ["GITHUB_TOKEN"]
        )
    except requests.exceptions.HTTPError as he:
        raise ValueError(
            f"Unable to set Secrets on GitHub repo https://github.com/{owner}/{repo} "
            f"- error message from GitHub is: {he}"
        )

    return f"git@github.com:{owner}/{repo}.git"
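# A hedged usage sketch for `github_auto_provision` above. The config-loading step and
# the owner/repo names are hypothetical stand-ins (any way of loading qhub-config.yaml
# into a dict works); the function itself expects provider credentials and GITHUB_TOKEN
# to already be present in the environment, and returns an SSH remote URL.
def _github_auto_provision_example():
    """Illustrative only; not part of the deployment code above."""
    import yaml

    with open("qhub-config.yaml") as f:
        config = yaml.safe_load(f)
    # The returned URL is suitable for e.g. `git remote add origin <url>`.
    return github_auto_provision(config, owner="my-org", repo="my-qhub-deployment")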