Example #1
def delete_service_by_challenge_pk(challenge):
    """
    Deletes the workers service of a challenge.

    Before deleting, it scales down the number of workers in the service to 0, then proceeds to delete the service.

    Parameters:
    challenge (<class 'challenges.models.Challenge'>): The challenge object whose workers service is to be deleted.

    Returns:
    dict: The response returned by the delete_service method from boto3
    """
    client = get_boto3_client("ecs", aws_keys)
    queue_name = challenge.queue
    service_name = "{}_service".format(queue_name)
    kwargs = delete_service_args.format(CLUSTER=COMMON_SETTINGS_DICT["CLUSTER"], service_name=service_name, force=True)
    kwargs = eval(kwargs)
    try:
        if (challenge.workers != 0):
            response = update_service_by_challenge_pk(client, challenge, 0, False)
            if (response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK):
                return response

        response = client.delete_service(**kwargs)
        if (response["ResponseMetadata"]["HTTPStatusCode"] == HTTPStatus.OK):
            challenge.workers = None
            challenge.save()
            client.deregister_task_definition(taskDefinition=challenge.task_def_arn)
            challenge.task_def_arn = ""
            challenge.save()
        return response
    except ClientError as e:
        logger.exception(e)
        return e.response
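
Here delete_service_args is a JSON-style template defined elsewhere in the module; format() fills it in and eval() turns it into the keyword arguments for boto3's delete_service call. A plausible shape for the template, as a hypothetical reconstruction rather than the module's actual definition:

# Hypothetical reconstruction of the delete_service_args template used above;
# the keys mirror boto3's ECS delete_service parameters (cluster, service, force).
delete_service_args = """
{{
    "cluster": "{CLUSTER}",
    "service": "{service_name}",
    "force": {force}
}}
"""
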
Example #2
def scale_workers(queryset, num_of_tasks):
    """
    The function called by the admin action method to scale all the selected workers.

    Calls the service_manager method. Before calling, checks whether the target scaling number differs from the current worker count.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the Django admin page.
    num_of_tasks (int): The number of workers to scale each challenge's service to.

    Returns:
    dict: keys-> 'count': the number of worker services successfully scaled.
                 'failures': a list of dicts, each with the error message and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (challenge.workers is None):
            response = "Please start worker(s) before scaling."
            failures.append({"message": response, "challenge_pk": challenge.pk})
            continue
        if (num_of_tasks == challenge.workers):
            response = "Please scale to a different number. Challenge has {} worker(s).".format(num_of_tasks)
            failures.append({"message": response, "challenge_pk": challenge.pk})
            continue
        response = service_manager(client, challenge=challenge, num_of_tasks=num_of_tasks)
        if (response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK):
            failures.append({"message": response['Error'], "challenge_pk": challenge.pk})
            continue
        count += 1
    return {"count": count, "failures": failures}
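
As a usage sketch, a Django admin action wrapping scale_workers might look like the following; the action name, target count, and messaging are illustrative assumptions, not EvalAI's actual admin code:

# Hypothetical admin action delegating to scale_workers above.
def scale_selected_workers(modeladmin, request, queryset):
    result = scale_workers(queryset, num_of_tasks=5)  # target count is illustrative
    modeladmin.message_user(
        request,
        "Scaled {} worker service(s); {} failure(s).".format(
            result["count"], len(result["failures"])),
    )
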
Example #3
def generate_presigned_url(file_key_on_s3, challenge_pk):
    """
    Function to get the presigned url to upload a file to s3
    Arguments:
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched
    Returns:
        response_data {dict} -- Dict containing the presigned_url or the error if request failed
    """
    if settings.DEBUG or settings.TEST:
        return

    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)

        s3 = get_boto3_client("s3", aws_keys)
        response = s3.generate_presigned_url(
            "put_object",
            Params={
                "Bucket": aws_keys["AWS_STORAGE_BUCKET_NAME"],
                "Key": file_key_on_s3,
            },
            ExpiresIn=settings.PRESIGNED_URL_EXPIRY_TIME,
            HttpMethod="PUT",
        )
        response_data = {"presigned_url": response}
        return response_data
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not fetch presigned url."}
        return response_data
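
On the client side, the returned presigned URL can be used directly with an HTTP PUT. A minimal sketch using requests; the file key and challenge pk are illustrative:

import requests

# Upload a local file to S3 via the presigned PUT URL returned above.
response_data = generate_presigned_url("submissions/file.zip", challenge_pk=1)
with open("file.zip", "rb") as f:
    resp = requests.put(response_data["presigned_url"], data=f)
resp.raise_for_status()
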
Example #4
def create_eks_nodegroup(challenge, cluster_name):
    """
    Creates a nodegroup when an EKS cluster is created by the EvalAI admin
    Arguments:
        challenge {str} -- JSON-serialized challenge object
        cluster_name {str} -- name of the EKS cluster
    """
    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    nodegroup_name = "{0}-nodegroup".format(
        challenge_obj.title.replace(" ", "-"))
    client = get_boto3_client("eks", aws_keys)
    # TODO: Move the hardcoded cluster configuration such as the
    # instance_type, subnets, AMI to challenge configuration later.
    try:
        response = client.create_nodegroup(
            clusterName=cluster_name,
            nodegroupName=nodegroup_name,
            scalingConfig={
                "minSize": 1,
                "maxSize": 10,
                "desiredSize": 1
            },
            diskSize=100,
            subnets=[VPC_DICT["SUBNET_1"], VPC_DICT["SUBNET_2"]],
            instanceTypes=["g4dn.xlarge"],
            amiType="AL2_x86_64_GPU",
            nodeRole=settings.EKS_NODEGROUP_ROLE_ARN,
        )
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("nodegroup_active")
    waiter.wait(clusterName=cluster_name, nodegroupName=nodegroup_name)
Example #5
def complete_s3_multipart_file_upload(parts, upload_id, file_key_on_s3,
                                      challenge_pk):
    """
    Function to complete the multipart upload of s3 files using presigned urls
    Arguments:
        parts {List} -- List of S3 ETag and PartNumber for each uploaded chunk
        upload_id {string} -- Unique upload id for multipart file upload
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched
    Returns:
        response_data {dict} -- Dict containing the S3 completion response or the error if the request failed
    """
    if settings.DEBUG:
        return
    response_data = {}
    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)

        s3 = get_boto3_client("s3", aws_keys)
        response_data = s3.complete_multipart_upload(
            Bucket=aws_keys["AWS_STORAGE_BUCKET_NAME"],
            Key=file_key_on_s3,
            MultipartUpload={"Parts": parts},
            UploadId=upload_id,
        )
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not fetch presigned urls."}
    return response_data
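
The parts argument follows the structure S3's complete_multipart_upload expects: one entry per uploaded chunk, pairing the part number with the ETag returned when that part was uploaded. For example (ETag values are illustrative):

# Shape of the `parts` list expected by complete_multipart_upload:
parts = [
    {"ETag": '"9b2cf535f27731c974343645a3985328"', "PartNumber": 1},
    {"ETag": '"6f5902ac237024bdd0c176cb93063dc4"', "PartNumber": 2},
]
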
Example #6
def get_logs_from_cloudwatch(log_group_name, log_stream_prefix, start_time,
                             end_time, pattern):
    """
    To fetch logs of a container from CloudWatch within a specific time frame.
    """
    client = get_boto3_client("logs", aws_keys)
    logs = []
    if settings.DEBUG:
        logs = [
            "The worker logs in the development environment are available on the terminal. Please use docker-compose worker -f to view the logs."
        ]
    else:
        try:
            response = client.filter_log_events(
                logGroupName=log_group_name,
                logStreamNamePrefix=log_stream_prefix,
                startTime=start_time,
                endTime=end_time,
                filterPattern=pattern,
            )
            for event in response["events"]:
                logs.append(event["message"])
        except Exception as e:
            if e.response["Error"]["Code"] == "ResourceNotFoundException":
                return logs

            logger.exception(e)
            return [
                f"There is an error in displaying logs. Please find the full error traceback here {e}"
            ]
    return logs
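
CloudWatch's filter_log_events takes startTime and endTime as milliseconds since the epoch, so callers typically convert datetimes first. A usage sketch; the log group and stream prefix names are illustrative:

from datetime import datetime, timedelta, timezone

# filter_log_events expects epoch timestamps in milliseconds.
end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)
logs = get_logs_from_cloudwatch(
    log_group_name="challenge-pk-1-workers",  # illustrative name
    log_stream_prefix="worker",               # illustrative prefix
    start_time=int(start.timestamp() * 1000),
    end_time=int(end.timestamp() * 1000),
    pattern="",
)
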
Example #7
def start_workers(queryset):
    """
    The function called by the admin action method to start all the selected workers.

    Calls the service_manager method. Before calling, checks that the workers are inactive.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the Django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully started.
                 'failures': a list of dicts, each with the error message and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (challenge.workers == 0) or (challenge.workers is None):
            response = service_manager(client,
                                       challenge=challenge,
                                       num_of_tasks=1)
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                failures.append({
                    "message": response["Error"],
                    "challenge_pk": challenge.pk,
                })
                continue
            count += 1
        else:
            response = "Please select challenge with inactive workers only."
            failures.append({
                "message": response,
                "challenge_pk": challenge.pk
            })
    return {"count": count, "failures": failures}
Example #8
def restart_workers(queryset):
    """
    The function called by the admin action method to restart all the selected workers.

    Calls the service_manager method. Before calling, verifies that the challenge's workers are active.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the Django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully restarted.
                 'failures': a list of dicts, each with the error message and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (challenge.workers is not None) and (challenge.workers > 0):
            response = service_manager(client, challenge, num_of_tasks=challenge.workers, force_new_deployment=True)
            if (response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK):
                failures.append({"message": response['Error'], "challenge_pk": challenge.pk})
                continue
            count += 1
        else:
            response = "Please select challenges with active workers only."
            failures.append({"message": response, "challenge_pk": challenge.pk})
    return {"count": count, "failures": failures}
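
service_manager is defined elsewhere, but the force_new_deployment flag maps onto ECS's update_service option of the same name, which redeploys tasks without changing the task definition. A sketch of the call it presumably makes (an assumption about service_manager's internals):

# Assumed shape of the underlying call: forceNewDeployment=True tells ECS
# to replace running tasks with fresh ones using the same task definition.
client.update_service(
    cluster=COMMON_SETTINGS_DICT["CLUSTER"],
    service="{}_service".format(challenge.queue),
    desiredCount=challenge.workers,
    forceNewDeployment=True,
)
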
Example #9
def create_eks_nodegroup(challenge, cluster_name):
    """
    Creates a nodegroup when an EKS cluster is created by the EvalAI admin
    Arguments:
        challenge {str} -- JSON-serialized challenge object
        cluster_name {str} -- name of the EKS cluster
    """
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    nodegroup_name = "{}-{}-nodegroup".format(
        challenge_obj.title.replace(" ", "-"), environment_suffix
    )
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    client = get_boto3_client("eks", challenge_aws_keys)
    cluster_meta = get_code_upload_setup_meta_for_challenge(challenge_obj.pk)
    # TODO: Move the hardcoded cluster configuration such as the
    # instance_type, subnets, AMI to challenge configuration later.
    try:
        response = client.create_nodegroup(
            clusterName=cluster_name,
            nodegroupName=nodegroup_name,
            scalingConfig={
                "minSize": challenge_obj.min_worker_instance,
                "maxSize": challenge_obj.max_worker_instance,
                "desiredSize": challenge_obj.desired_worker_instance,
            },
            diskSize=challenge_obj.worker_disk_size,
            subnets=[cluster_meta["SUBNET_1"], cluster_meta["SUBNET_2"]],
            instanceTypes=[challenge_obj.worker_instance_type],
            amiType=challenge_obj.worker_ami_type,
            nodeRole=cluster_meta["EKS_NODEGROUP_ROLE_ARN"],
        )
        logger.info("Nodegroup create: {}".format(response))
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("nodegroup_active")
    waiter.wait(clusterName=cluster_name, nodegroupName=nodegroup_name)
    construct_and_send_eks_cluster_creation_mail(challenge_obj)
    # starting the code-upload-worker
    client = get_boto3_client("ecs", aws_keys)
    client_token = client_token_generator(challenge_obj.pk)
    create_service_by_challenge_pk(client, challenge_obj, client_token)
Example #10
def restart_workers(queryset):
    """
    The function called by the admin action method to restart all the selected workers.

    Calls the service_manager method. Before calling, verifies that the challenge's workers are active.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the Django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully restarted.
                 'failures': a list of dicts, each with the error message and the challenge pk
    """
    if settings.DEBUG:
        failures = []
        for challenge in queryset:
            failures.append(
                {
                    "message": "Workers cannot be restarted on AWS ECS service in development environment",
                    "challenge_pk": challenge.pk,
                }
            )
        return {"count": 0, "failures": failures}

    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (
            challenge.is_docker_based
            and not challenge.is_static_dataset_code_upload
        ):
            response = "Sorry. This feature is not available for code upload/docker based challenges."
            failures.append(
                {"message": response, "challenge_pk": challenge.pk}
            )
        elif (challenge.workers is not None) and (challenge.workers > 0):
            response = service_manager(
                client,
                challenge=challenge,
                num_of_tasks=challenge.workers,
                force_new_deployment=True,
            )
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                failures.append(
                    {
                        "message": response["Error"],
                        "challenge_pk": challenge.pk,
                    }
                )
                continue
            count += 1
        else:
            response = "Please select challenges with active workers only."
            failures.append(
                {"message": response, "challenge_pk": challenge.pk}
            )
    return {"count": count, "failures": failures}
Example #11
def get_signed_url_for_submission_related_file(request):
    """Returns S3 signed URL for a particular file residing on S3 bucket

    Arguments:
        request {object} -- Request object

    Returns:
        Response object -- Response object with appropriate response code (200/400/403/404)
    """

    # Assumption: file will be stored in this format: 'team_{id}/submission_{id}/.../file.log'
    bucket = request.query_params.get("bucket", None)
    key = request.query_params.get("key", None)

    if not bucket or not key:
        response_data = {"error": "key and bucket names can't be empty"}
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    try:
        splits = key.split("/")
        participant_team_id, submission_id = (
            splits[0].replace("team_", ""),
            splits[1].replace("submission_", ""),
        )
    except Exception:
        response_data = {
            "error": "Invalid file path format. Please try again with correct file path format."
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    participant_team = get_participant_team_model(participant_team_id)
    submission = get_submission_model(submission_id)
    challenge_pk = submission.challenge_phase.challenge.pk

    if submission.participant_team != participant_team:
        response_data = {
            "error": "You are not authorized to access this file."
        }
        return Response(response_data, status=status.HTTP_403_FORBIDDEN)

    if is_user_part_of_participant_team(
            request.user, participant_team) or is_user_a_host_of_challenge(
                request.user, challenge_pk):
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)
        s3 = get_boto3_client("s3", aws_keys)
        url = s3.generate_presigned_url(ClientMethod="get_object",
                                        Params={
                                            "Bucket": bucket,
                                            "Key": key
                                        })
        response_data = {"signed_url": url}
        return Response(response_data, status=status.HTTP_200_OK)
    else:
        response_data = {
            "error": "You are not authorized to access this file."
        }
        return Response(response_data, status=status.HTTP_403_FORBIDDEN)
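
A caller hits this view with the bucket and key as query parameters. An illustrative request; the URL path, host, and auth header are placeholders, not EvalAI's actual routes:

import requests

# Illustrative call to the signed-URL view above.
resp = requests.get(
    "https://evalai.example.org/api/signed_url/",
    params={"bucket": "evalai-bucket", "key": "team_1/submission_2/file.log"},
    headers={"Authorization": "Token <auth-token>"},
)
signed_url = resp.json()["signed_url"]
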
Example #12
def delete_log_group(log_group_name):
    if settings.DEBUG:
        return
    try:
        client = get_boto3_client("logs", aws_keys)
        client.delete_log_group(logGroupName=log_group_name)
    except Exception as e:
        logger.exception(e)
Example #13
def generate_presigned_url_for_multipart_upload(file_key_on_s3, challenge_pk,
                                                num_parts):
    """
    Function to get the presigned urls to upload a file to s3 in chunks
    Arguments:
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched
        num_parts {int} -- number of parts into which the file will be split for upload
    Returns:
        response_data {dict} -- Dict containing the presigned_urls or the error if request failed
    """
    if settings.DEBUG:
        return
    response_data = {}
    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)

        s3 = get_boto3_client("s3", aws_keys)
        response = s3.create_multipart_upload(
            Bucket=aws_keys["AWS_STORAGE_BUCKET_NAME"],
            Key=file_key_on_s3,
            ACL="public-read",
        )

        upload_id = response["UploadId"]
        presigned_urls = []
        for part_number in range(1, num_parts + 1):
            presigned_url = s3.generate_presigned_url(
                ClientMethod="upload_part",
                Params={
                    "Bucket": aws_keys["AWS_STORAGE_BUCKET_NAME"],
                    "Key": file_key_on_s3,
                    "UploadId": upload_id,
                    "PartNumber": part_number,
                },
                ExpiresIn=settings.PRESIGNED_URL_EXPIRY_TIME,
            )
            presigned_urls.append({
                "partNumber": part_number,
                "url": presigned_url
            })
        response_data = {
            "presigned_urls": presigned_urls,
            "upload_id": upload_id,
        }
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not fetch presigned urls."}
    return response_data
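
Together with complete_s3_multipart_file_upload above, the client-side flow is: request the presigned URLs, PUT each chunk, collect the returned ETags, then complete the upload. A condensed sketch; the file key, challenge pk, and chunk size are illustrative:

import requests

# Illustrative end-to-end flow combining the two multipart helpers above.
data = generate_presigned_url_for_multipart_upload("sub/file.zip", 1, num_parts=2)
parts = []
with open("file.zip", "rb") as f:
    for entry in data["presigned_urls"]:
        chunk = f.read(100 * 1024 * 1024)  # 100 MB per part, illustrative
        resp = requests.put(entry["url"], data=chunk)
        parts.append({"ETag": resp.headers["ETag"],
                      "PartNumber": entry["partNumber"]})
complete_s3_multipart_file_upload(parts, data["upload_id"], "sub/file.zip", 1)
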
Example #14
def get_or_create_ecr_repository(name, aws_keys):
    """Get or create AWS ECR Repository

    Arguments:
        name {string} -- name of ECR repository

    Keyword Arguments:
        aws_keys {dict} -- AWS keys where the ECR repositories will be created

    Returns:
        tuple -- Contains repository dict and boolean field to represent whether ECR repository was created
        Eg: (
                {
                    'repositoryArn': 'arn:aws:ecr:us-east-1:1234567890:repository/some-repository-name',
                    'registryId': '1234567890',
                    'repositoryName': 'some-repository-name',
                    'repositoryUri': '1234567890.dkr.ecr.us-east-1.amazonaws.com/some-repository-name',
                    'createdAt': datetime.datetime(2019, 2, 6, 9, 12, 5, tzinfo=tzlocal())
                },
                False
            )
    """
    repository, created = None, False
    client = get_boto3_client("ecr", aws_keys)
    try:
        response = client.describe_repositories(
            registryId=aws_keys.get("AWS_ACCOUNT_ID"), repositoryNames=[name]
        )
        repository = response["repositories"][0]
    except ClientError as e:
        if (
            e.response["Error"]["Code"] == "RepositoryNotFoundException"
            or e.response["Error"]["Code"] == "400"
        ):
            response = client.create_repository(repositoryName=name)
            repository = response["repository"]
            created = True
        else:
            logger.exception(e)
    return (repository, created)
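
Usage follows the familiar get-or-create pattern; for example:

# created is False when the repository already existed.
repository, created = get_or_create_ecr_repository("some-repository-name", aws_keys)
if created:
    logger.info("Created ECR repository %s", repository["repositoryUri"])
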
Example #15
def create_federated_user(name, repository, aws_keys):
    """Create AWS federated user

    Arguments:
        name {string} -- Name of participant team for which federated user is to be created
        repository {string} -- Name of the AWS ECR repository to which user should be granted permission

    Returns:
        dict -- Dict containing user related credentials such as access_key_id, access_secret etc.
        Eg:
        {
            'Credentials': {
                'AccessKeyId': 'ABCDEFGHIJKLMNOPQRTUVWXYZ',
                'SecretAccessKey': 'NMgBB75gfVBCDEFGHIJK8g00qVyyzQW+4XjJGQALMNOPQRSTUV',
                'SessionToken': 'FQoGZX.....',
                'Expiration': datetime.datetime(2019, 2, 7, 5, 43, 58, tzinfo=tzutc())
            },
            'FederatedUser': {
                'FederatedUserId': '1234567890:test-user',
                'Arn': 'arn:aws:sts::1234567890:federated-user/test-user'
            },
            'PackedPolicySize': 28,
            'ResponseMetadata': {
                'RequestId': 'fb47f78b-2a92-11e9-84b9-33527429b818',
                'HTTPStatusCode': 200,
                'HTTPHeaders': {
                'x-amzn-requestid': 'fb47f78b-2a92-11e9-84b9-33527429b818',
                'content-type': 'text/xml',
                'content-length': '1245',
                'date': 'Thu, 07 Feb 2019 04:43:57 GMT'
                },
                'RetryAttempts': 0
            }
        }
    """
    AWS_ACCOUNT_ID = aws_keys.get("AWS_ACCOUNT_ID")
    AWS_REGION = aws_keys.get("AWS_REGION")
    policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": "ecr:*",
                "Resource": "arn:aws:ecr:{}:{}:repository/{}".format(
                    AWS_REGION, AWS_ACCOUNT_ID, repository),
            },
            {
                "Effect": "Allow",
                "Action": ["ecr:GetAuthorizationToken"],
                "Resource": "*",
            },
        ],
    }
    client = get_boto3_client("sts", aws_keys)
    response = client.get_federation_token(
        Name=convert_to_aws_federated_user_format(name),
        Policy=json.dumps(policy),
        DurationSeconds=43200,
    )
    return response
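
The returned temporary credentials can be fed straight into a boto3 ECR client, for example to fetch a Docker login token. A sketch; the team and repository names are illustrative:

import boto3

creds = create_federated_user("some-team", "some-repository-name", aws_keys)["Credentials"]
ecr = boto3.client(
    "ecr",
    region_name=aws_keys.get("AWS_REGION"),
    aws_access_key_id=creds["AccessKeyId"],
    aws_secret_access_key=creds["SecretAccessKey"],
    aws_session_token=creds["SessionToken"],
)
# The base64 token decodes to "AWS:<password>" for docker login.
token = ecr.get_authorization_token()["authorizationData"][0]["authorizationToken"]
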
Example #16
def create_eks_cluster(challenge):
    """
    Called when a challenge is approved by the EvalAI admin;
    calls the create_eks_nodegroup function.

    Arguments:
        challenge {str} -- JSON-serialized challenge object
    """
    from .models import ChallengeEvaluationCluster

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    cluster_name = "{0}-cluster".format(challenge_obj.title.replace(" ", "-"))
    if challenge_obj.approved_by_admin and challenge_obj.is_docker_based:
        client = get_boto3_client("eks", aws_keys)
        try:
            response = client.create_cluster(
                name=cluster_name,
                version="1.15",
                roleArn=settings.EKS_CLUSTER_ROLE_ARN,
                resourcesVpcConfig={
                    "subnetIds": [VPC_DICT["SUBNET_1"], VPC_DICT["SUBNET_2"]],
                    "securityGroupIds": [VPC_DICT["SUBNET_SECURITY_GROUP"]],
                },
            )
            waiter = client.get_waiter("cluster_active")
            waiter.wait(name=cluster_name)
            # creating kubeconfig
            cluster = client.describe_cluster(name=cluster_name)
            cluster_cert = cluster["cluster"]["certificateAuthority"]["data"]
            cluster_ep = cluster["cluster"]["endpoint"]
            cluster_config = {
                "apiVersion": "v1",
                "kind": "Config",
                "clusters": [{
                    "cluster": {
                        "server": str(cluster_ep),
                        "certificate-authority-data": str(cluster_cert),
                    },
                    "name": "kubernetes",
                }],
                "contexts": [{
                    # The context user matches the "aws" entry under "users" below.
                    "context": {"cluster": "kubernetes", "user": "aws"},
                    "name": "aws",
                }],
                "current-context": "aws",
                "preferences": {},
                "users": [{
                    "name": "aws",
                    "user": {
                        "exec": {
                            "apiVersion": "client.authentication.k8s.io/v1alpha1",
                            "command": "heptio-authenticator-aws",
                            "args": ["token", "-i", cluster_name],
                        }
                    },
                }],
            }

            # Write in YAML.
            config_text = yaml.dump(cluster_config, default_flow_style=False)
            config_file = NamedTemporaryFile(delete=True)
            config_file.write(config_text.encode())
            ChallengeEvaluationCluster.objects.create(
                challenge=challenge_obj,
                name=cluster_name,
                cluster_endpoint=cluster_ep,
                cluster_ssl=cluster_cert,
            )
            # Creating nodegroup
            create_eks_nodegroup.delay(challenge, cluster_name)
            return response
        except ClientError as e:
            logger.exception(e)
            return
Example #17
def create_eks_cluster_subnets(challenge):
    """
    Creates the VPC, subnets, internet gateway, security groups, and EFS
    needed for a challenge's EKS cluster.

    Arguments:
        challenge {str} -- JSON-serialized challenge object
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    client = get_boto3_client("ec2", challenge_aws_keys)
    vpc_ids = []
    try:
        response = client.create_vpc(CidrBlock=challenge_obj.vpc_cidr)
        vpc_ids.append(response["Vpc"]["VpcId"])
    except ClientError as e:
        logger.exception(e)
        return

    waiter = client.get_waiter("vpc_available")
    waiter.wait(VpcIds=vpc_ids)

    # Create internet gateway and attach to vpc
    try:
        # Enable DNS resolution for VPC
        response = client.modify_vpc_attribute(
            EnableDnsHostnames={"Value": True}, VpcId=vpc_ids[0])

        response = client.create_internet_gateway()
        internet_gateway_id = response["InternetGateway"]["InternetGatewayId"]
        client.attach_internet_gateway(InternetGatewayId=internet_gateway_id,
                                       VpcId=vpc_ids[0])

        # Create and attach route table
        response = client.create_route_table(VpcId=vpc_ids[0])
        route_table_id = response["RouteTable"]["RouteTableId"]
        client.create_route(
            DestinationCidrBlock="0.0.0.0/0",
            GatewayId=internet_gateway_id,
            RouteTableId=route_table_id,
        )

        # Create subnets
        subnet_ids = []
        response = client.create_subnet(
            CidrBlock=challenge_obj.subnet_1_cidr,
            AvailabilityZone="us-east-1a",
            VpcId=vpc_ids[0],
        )
        subnet_1_id = response["Subnet"]["SubnetId"]
        subnet_ids.append(subnet_1_id)

        response = client.create_subnet(
            CidrBlock=challenge_obj.subnet_2_cidr,
            AvailabilityZone="us-east-1b",
            VpcId=vpc_ids[0],
        )
        subnet_2_id = response["Subnet"]["SubnetId"]
        subnet_ids.append(subnet_2_id)

        waiter = client.get_waiter("subnet_available")
        waiter.wait(SubnetIds=subnet_ids)

        # Creating managed node group needs subnets to auto assign ip v4
        for subnet_id in subnet_ids:
            response = client.modify_subnet_attribute(
                MapPublicIpOnLaunch={
                    "Value": True,
                },
                SubnetId=subnet_id,
            )

        # Associate route table with subnets
        response = client.associate_route_table(
            RouteTableId=route_table_id,
            SubnetId=subnet_1_id,
        )

        response = client.associate_route_table(
            RouteTableId=route_table_id,
            SubnetId=subnet_2_id,
        )

        # Create security group
        response = client.create_security_group(
            GroupName="EvalAI code upload challenge",
            Description="EvalAI code upload challenge worker group",
            VpcId=vpc_ids[0],
        )
        security_group_id = response["GroupId"]

        response = client.create_security_group(
            GroupName="evalai-code-upload-challenge-efs-{}".format(
                environment_suffix),
            Description="EKS nodegroup EFS",
            VpcId=vpc_ids[0],
        )
        efs_security_group_id = response["GroupId"]

        response = client.authorize_security_group_ingress(
            GroupId=efs_security_group_id,
            IpPermissions=[{
                "FromPort": 2049,
                "IpProtocol": "tcp",
                "IpRanges": [
                    {
                        "CidrIp": challenge_obj.vpc_cidr,
                    },
                ],
                "ToPort": 2049,
            }],
        )

        # Create EFS
        efs_client = get_boto3_client("efs", challenge_aws_keys)
        efs_creation_token = str(uuid.uuid4())[:64]
        response = efs_client.create_file_system(
            CreationToken=efs_creation_token, )
        efs_id = response["FileSystemId"]

        challenge_evaluation_cluster = ChallengeEvaluationCluster.objects.get(
            challenge=challenge_obj)
        serializer = ChallengeEvaluationClusterSerializer(
            challenge_evaluation_cluster,
            data={
                "vpc_id": vpc_ids[0],
                "internet_gateway_id": internet_gateway_id,
                "route_table_id": route_table_id,
                "security_group_id": security_group_id,
                "subnet_1_id": subnet_1_id,
                "subnet_2_id": subnet_2_id,
                "efs_security_group_id": efs_security_group_id,
                "efs_id": efs_id,
                "efs_creation_token": efs_creation_token,
            },
            partial=True,
        )
        if serializer.is_valid():
            serializer.save()
        # Create eks cluster
        create_eks_cluster.delay(challenge)
    except ClientError as e:
        logger.exception(e)
        return
Example #18
def create_eks_cluster(challenge):
    """
    Called when a challenge is approved by the EvalAI admin;
    calls the create_eks_nodegroup function.

    Arguments:
        challenge {str} -- JSON-serialized challenge object
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    cluster_name = "{}-{}-cluster".format(
        challenge_obj.title.replace(" ", "-"), environment_suffix)
    if challenge_obj.approved_by_admin and challenge_obj.is_docker_based:
        challenge_aws_keys = get_aws_credentials_for_challenge(
            challenge_obj.pk)
        client = get_boto3_client("eks", challenge_aws_keys)
        cluster_meta = get_code_upload_setup_meta_for_challenge(
            challenge_obj.pk)
        try:
            response = client.create_cluster(
                name=cluster_name,
                version="1.16",
                roleArn=cluster_meta["EKS_CLUSTER_ROLE_ARN"],
                resourcesVpcConfig={
                    "subnetIds": [
                        cluster_meta["SUBNET_1"],
                        cluster_meta["SUBNET_2"],
                    ],
                    "securityGroupIds":
                    [cluster_meta["SUBNET_SECURITY_GROUP"]],
                },
            )
            waiter = client.get_waiter("cluster_active")
            waiter.wait(name=cluster_name)
            # creating kubeconfig
            cluster = client.describe_cluster(name=cluster_name)
            cluster_cert = cluster["cluster"]["certificateAuthority"]["data"]
            cluster_ep = cluster["cluster"]["endpoint"]
            cluster_config = {
                "apiVersion": "v1",
                "kind": "Config",
                "clusters": [{
                    "cluster": {
                        "server": str(cluster_ep),
                        "certificate-authority-data": str(cluster_cert),
                    },
                    "name": "kubernetes",
                }],
                "contexts": [{
                    # The context user matches the "aws" entry under "users" below.
                    "context": {"cluster": "kubernetes", "user": "aws"},
                    "name": "aws",
                }],
                "current-context": "aws",
                "preferences": {},
                "users": [{
                    "name": "aws",
                    "user": {
                        "exec": {
                            "apiVersion": "client.authentication.k8s.io/v1alpha1",
                            "command": "heptio-authenticator-aws",
                            "args": ["token", "-i", cluster_name],
                        }
                    },
                }],
            }

            # Write in YAML.
            config_text = yaml.dump(cluster_config, default_flow_style=False)
            config_file = NamedTemporaryFile(delete=True)
            config_file.write(config_text.encode())
            challenge_evaluation_cluster = (
                ChallengeEvaluationCluster.objects.get(
                    challenge=challenge_obj))

            efs_client = get_boto3_client("efs", challenge_aws_keys)
            # Create mount targets for subnets
            mount_target_ids = []
            response = efs_client.create_mount_target(
                FileSystemId=challenge_evaluation_cluster.efs_id,
                SubnetId=challenge_evaluation_cluster.subnet_1_id,
                SecurityGroups=[
                    challenge_evaluation_cluster.efs_security_group_id
                ],
            )
            mount_target_ids.append(response["MountTargetId"])

            response = efs_client.create_mount_target(
                FileSystemId=challenge_evaluation_cluster.efs_id,
                SubnetId=challenge_evaluation_cluster.subnet_2_id,
                SecurityGroups=[
                    challenge_evaluation_cluster.efs_security_group_id
                ],
            )
            mount_target_ids.append(response["MountTargetId"])

            serializer = ChallengeEvaluationClusterSerializer(
                challenge_evaluation_cluster,
                data={
                    "name": cluster_name,
                    "cluster_endpoint": cluster_ep,
                    "cluster_ssl": cluster_cert,
                    "efs_mount_target_ids": mount_target_ids,
                },
                partial=True,
            )
            if serializer.is_valid():
                serializer.save()
            # Creating nodegroup
            create_eks_nodegroup.delay(challenge, cluster_name)
            return response
        except ClientError as e:
            logger.exception(e)
            return
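
A note on the kubeconfig written above: NamedTemporaryFile(delete=True) removes the file once the handle is closed, so anything consuming it must do so while the handle is still open. Loading it with the official kubernetes Python client would look roughly like this (an assumed consumer; the snippet above only writes the file):

from kubernetes import config
from kubernetes import client as k8s_client

# Flush so the YAML reaches disk, then load it while the handle is open,
# since delete=True removes the file on close.
config_file.flush()
config.load_kube_config(config_file=config_file.name)
v1 = k8s_client.CoreV1Api()
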
Example #19
def setup_eks_cluster(challenge):
    """
    Creates EKS and NodeGroup ARN roles

    Arguments:
        challenge {str} -- JSON-serialized challenge object
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    client = get_boto3_client("iam", challenge_aws_keys)
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    eks_role_name = "evalai-code-upload-eks-role-{}".format(environment_suffix)
    eks_arn_role = None
    try:
        response = client.create_role(
            RoleName=eks_role_name,
            Description="Amazon EKS cluster role with managed policy",
            AssumeRolePolicyDocument=json.dumps(
                settings.EKS_CLUSTER_TRUST_RELATION),
        )
        eks_arn_role = response["Role"]["Arn"]
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("role_exists")
    waiter.wait(RoleName=eks_role_name)

    try:
        # Attach AWS managed EKS cluster policy to the role
        response = client.attach_role_policy(
            RoleName=eks_role_name,
            PolicyArn=settings.EKS_CLUSTER_POLICY,
        )
    except ClientError as e:
        logger.exception(e)
        return

    node_group_role_name = "evalai-code-upload-nodegroup-role-{}".format(
        environment_suffix)
    node_group_arn_role = None
    try:
        response = client.create_role(
            RoleName=node_group_role_name,
            Description="Amazon EKS node group role with managed policy",
            AssumeRolePolicyDocument=json.dumps(
                settings.EKS_NODE_GROUP_TRUST_RELATION),
        )
        node_group_arn_role = response["Role"]["Arn"]
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("role_exists")
    waiter.wait(RoleName=node_group_role_name)

    task_execution_policies = settings.EKS_NODE_GROUP_POLICIES
    for policy_arn in task_execution_policies:
        try:
            # Attach AWS managed EKS worker node policy to the role
            response = client.attach_role_policy(
                RoleName=node_group_role_name,
                PolicyArn=policy_arn,
            )
        except ClientError as e:
            logger.exception(e)
            return

    # Create custom ECR all access policy and attach to node_group_role
    ecr_all_access_policy_name = "AWS-ECR-Full-Access-{}".format(
        environment_suffix)
    ecr_all_access_policy_arn = None
    try:
        response = client.create_policy(
            PolicyName=ecr_all_access_policy_name,
            PolicyDocument=json.dumps(settings.ECR_ALL_ACCESS_POLICY_DOCUMENT),
        )
        ecr_all_access_policy_arn = response["Policy"]["Arn"]
        waiter = client.get_waiter("policy_exists")
        waiter.wait(PolicyArn=ecr_all_access_policy_arn)
        # Attach custom ECR policy
        response = client.attach_role_policy(
            RoleName=node_group_role_name, PolicyArn=ecr_all_access_policy_arn)
    except ClientError as e:
        logger.exception(e)
        return
    try:
        challenge_evaluation_cluster = ChallengeEvaluationCluster.objects.get(
            challenge=challenge_obj)
        serializer = ChallengeEvaluationClusterSerializer(
            challenge_evaluation_cluster,
            data={
                "eks_arn_role": eks_arn_role,
                "node_group_arn_role": node_group_arn_role,
                "ecr_all_access_policy_arn": ecr_all_access_policy_arn,
            },
            partial=True,
        )
        if serializer.is_valid():
            serializer.save()
        # Create eks cluster vpc and subnets
        create_eks_cluster_subnets.delay(challenge)
    except Exception as e:
        logger.exception(e)
        return
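
Taken together, the EKS setup runs as a chain of deferred tasks: setup_eks_cluster creates the IAM roles, then hands off to create_eks_cluster_subnets (VPC, subnets, EFS), which hands off to create_eks_cluster, which finally schedules create_eks_nodegroup. A sketch of kicking off the chain, assuming setup_eks_cluster is registered as a Celery task like the tasks it chains into; the serialization step mirrors how each task deserializes its argument:

from django.core import serializers

# Each task re-inflates the challenge with serializers.deserialize("json", ...),
# so the kickoff serializes the model instance the same way.
serialized_challenge = serializers.serialize("json", [challenge_obj])
setup_eks_cluster.delay(serialized_challenge)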