def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type,
                 ec2_key_name, ec2_instance_role_name, ec2_instance_ami,
                 region):
    """
    Launch one EC2 instance for a test and register its cleanup.

    An SSH key pair is generated first, then a single instance is created.
    Before returning, ec2_utils.check_instance_state (state "running") and
    ec2_utils.check_system_state (statuses "ok") are called for the instance.

    :param request: object exposing ``fixturenames`` and ``addfinalizer``
        (presumably the pytest fixture request -- confirm against caller)
    :param ec2_client: EC2 client used for the key pair and for termination
    :param ec2_resource: EC2 resource whose ``create_instances`` is called
    :param ec2_instance_type: value passed as ``InstanceType``
    :param ec2_key_name: key pair name; also used in the instance "Name" tag
    :param ec2_instance_role_name: value passed as the IAM instance profile name
    :param ec2_instance_ami: value passed as ``ImageId``
    :param region: region forwarded to the ec2_utils state checks
    :return: <tuple> instance_id, key_filename
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # In PR context the key pair is destroyed right away; otherwise its
        # file name is appended to KEYS_TO_DESTROY_FILE.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    # Register key cleanup BEFORE launching, so a failed create_instances call
    # does not leak the key pair that was just generated.
    request.addfinalizer(delete_ssh_keypair)

    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {
            "Name": ec2_instance_role_name
        },
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": f"CI-CD {ec2_key_name}"
                }]
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }

    # os.getenv returns None when TEST_TYPE is unset; fall back to "" so the
    # substring test below cannot raise TypeError.
    test_type = os.getenv("TEST_TYPE") or ""
    gpu_only = "gpu_only" in request.fixturenames
    mxnet_gpu_benchmark = ("benchmark" in test_type
                           and "mxnet_training" in request.fixturenames
                           and gpu_only)
    tensorflow_gpu_horovod = ("tensorflow_training" in request.fixturenames
                              and gpu_only and "horovod" in ec2_key_name)
    if mxnet_gpu_benchmark or tensorflow_gpu_horovod:
        # These test combinations get a 300 (VolumeSize) volume on /dev/sda1.
        params["BlockDeviceMappings"] = [{
            "DeviceName": "/dev/sda1",
            "Ebs": {
                "VolumeSize": 300,
            }
        }]

    instances = ec2_resource.create_instances(**params)
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes.
    # NOTE(review): assuming pytest semantics, finalizers run in reverse
    # registration order, so termination still precedes key-pair cleanup.
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)
    return instance_id, key_filename
# Example #2
# 0
def attach_ecs_worker_node(worker_instance_type,
                           ami_id,
                           cluster_name,
                           cluster_arn=None,
                           region=DEFAULT_REGION,
                           worker_eia_capable=False):
    """
    Launch a worker instance and confirm it shows up in an ECS cluster.

    :param worker_instance_type: instance type passed to ec2_utils.launch_instance
    :param ami_id: AMI passed to ec2_utils.launch_instance
    :param cluster_name: name written to ECS_CLUSTER in /etc/ecs/ecs.config via
        the user-data script; also used in the instance name
    :param cluster_arn: cluster to query for container instances; when None,
        cluster_name is used instead
    :param region: region forwarded to the ec2_utils helpers and the ECS query
    :param worker_eia_capable: forwarded as ``eia_capable`` to launch_instance
    :return: <tuple> instance_id, public_ip_address
    :raises Exception: if no ACTIVE, agent-connected container instance is
        listed for the launched instance
    """
    # User-data script that appends the cluster name to the ECS agent config.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    # Build the instance-profile ARN from the caller's account id.
    account_id = boto3.client("sts").get_caller_identity().get("Account")
    ecs_role_name = "ecsInstanceRole"
    ecs_instance_role_arn = f"arn:aws:iam::{account_id}:instance-profile/{ecs_role_name}"

    launch_options = {
        "region": region,
        "instance_type": worker_instance_type,
        "user_data": ecs_user_data,
        "iam_instance_profile_arn": ecs_instance_role_arn,
        "instance_name": f"ecs worker {cluster_name}",
        "eia_capable": worker_eia_capable,
    }
    launched = ec2_utils.launch_instance(ami_id, **launch_options)
    instance_id = launched["InstanceId"]

    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(
        instance_id, system_status="ok", instance_status="ok", region=region)

    # Only container instances on this EC2 instance with a connected agent.
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    cluster_arn = cluster_name if cluster_arn is None else cluster_arn
    container_arns = list_ecs_container_instances(
        cluster_arn, list_container_filter, "ACTIVE", region)

    if container_arns:
        return instance_id, public_ip_address
    raise Exception(
        f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
    )
# Example #3
# 0
def attach_ecs_worker_node(worker_instance_type,
                           ami_id,
                           cluster_name,
                           cluster_arn=None,
                           region=DEFAULT_REGION):
    """
    Launch a worker instance and confirm it shows up in an ECS cluster.

    :param worker_instance_type: instance type passed to ec2_utils.launch_instance
    :param ami_id: AMI passed to ec2_utils.launch_instance
    :param cluster_name: name written to ECS_CLUSTER in /etc/ecs/ecs.config via
        the user-data script; also used in the instance name
    :param cluster_arn: cluster to query for container instances; when None,
        cluster_name is used instead
    :param region: region forwarded to the ec2_utils helpers and the ECS query
    :return: <tuple> instance_id, public_ip_address
    :raises Exception: if no ACTIVE, agent-connected container instance is
        listed for the launched instance
    """
    # User-data script that appends the cluster name to the ECS agent config.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    launch_options = {
        "region": region,
        "instance_type": worker_instance_type,
        "user_data": ecs_user_data,
        "iam_instance_profile_arn": ECS_INSTANCE_ROLE_ARN,
        "instance_name": f"ecs worker {cluster_name}",
    }
    launched = ec2_utils.launch_instance(ami_id, **launch_options)
    instance_id = launched["InstanceId"]

    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(
        instance_id, system_status="ok", instance_status="ok", region=region)

    # Only container instances on this EC2 instance with a connected agent.
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    cluster_arn = cluster_name if cluster_arn is None else cluster_arn
    container_arns = list_ecs_container_instances(
        cluster_arn, list_container_filter, "ACTIVE", region)

    if container_arns:
        return instance_id, public_ip_address
    raise Exception(
        f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
    )
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type,
                 ec2_key_name, ec2_instance_role_name, ec2_instance_ami,
                 region):
    """
    Launch one EC2 instance for a test and register its cleanup.

    An SSH key pair is generated first, then a single instance is created.
    Before returning, ec2_utils.check_instance_state (state "running") and
    ec2_utils.check_system_state (statuses "ok") are called for the instance.

    :param request: object exposing ``addfinalizer`` (presumably the pytest
        fixture request -- confirm against caller)
    :param ec2_client: EC2 client used for the key pair and for termination
    :param ec2_resource: EC2 resource whose ``create_instances`` is called
    :param ec2_instance_type: value passed as ``InstanceType``
    :param ec2_key_name: key pair name; also used in the instance "Name" tag
    :param ec2_instance_role_name: value passed as the IAM instance profile name
    :param ec2_instance_ami: value passed as ``ImageId``
    :param region: region forwarded to the ec2_utils state checks
    :return: <tuple> instance_id, key_filename
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # In PR context the key pair is destroyed right away; otherwise its
        # file name is appended to KEYS_TO_DESTROY_FILE.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, 'a') as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    # Register key cleanup BEFORE launching, so a failed create_instances call
    # does not leak the key pair that was just generated.
    request.addfinalizer(delete_ssh_keypair)

    instances = ec2_resource.create_instances(
        KeyName=ec2_key_name,
        ImageId=ec2_instance_ami,
        InstanceType=ec2_instance_type,
        IamInstanceProfile={"Name": ec2_instance_role_name},
        TagSpecifications=[
            {
                "ResourceType": "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": f"CI-CD {ec2_key_name}"
                }]
            },
        ],
        MaxCount=1,
        MinCount=1,
    )
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes.
    # NOTE(review): assuming pytest semantics, finalizers run in reverse
    # registration order, so termination still precedes key-pair cleanup.
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)
    return instance_id, key_filename
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type,
                 ec2_key_name, ec2_instance_role_name, ec2_instance_ami,
                 region, ei_accelerator_type):
    """
    Launch one EC2 instance for a test and register its cleanup.

    For "p3dn.24xlarge" the region, client, resource and (unless it is the
    PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1 AMI) the AMI are overridden.
    When ``ei_accelerator_type`` is set, the launch is attempted in a fixed
    list of availability zones for the region until one succeeds.

    :param request: object exposing ``fixturenames`` and ``addfinalizer``
        (presumably the pytest fixture request -- confirm against caller)
    :param ec2_client: EC2 client used for the key pair and for termination
    :param ec2_resource: EC2 resource whose ``create_instances`` is called
    :param ec2_instance_type: value passed as ``InstanceType``
    :param ec2_key_name: key pair name; also used in the instance "Name" tag
    :param ec2_instance_role_name: value passed as the IAM instance profile name
    :param ec2_instance_ami: value passed as ``ImageId``
    :param region: region used for the zone lookup and the state checks
    :param ei_accelerator_type: Elastic Inference accelerator type; falsy to
        launch without an accelerator
    :return: <tuple> instance_id, key_filename
    :raises ClientError: if the non-accelerator launch fails (re-raised unless
        the test is skipped for insufficient capacity)
    :raises RuntimeError: if no instance could be launched at all
    """
    if ec2_instance_type == "p3dn.24xlarge":
        region = P3DN_REGION
        ec2_client = boto3.client("ec2",
                                  region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource(
            "ec2",
            region_name=region,
            config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            ec2_instance_ami = UBUNTU_18_BASE_DLAMI_US_EAST_1
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # In PR context the key pair is destroyed right away; otherwise its
        # file name is appended to KEYS_TO_DESTROY_FILE.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)

    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {
            "Name": ec2_instance_role_name
        },
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": f"CI-CD {ec2_key_name}"
                }]
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }

    # os.getenv returns None when TEST_TYPE is unset; fall back to "" so the
    # substring test below cannot raise TypeError.
    test_type = os.getenv("TEST_TYPE") or ""
    fixture_names = request.fixturenames
    benchmark_run = "benchmark" in test_type or is_benchmark_dev_context()
    mxnet_benchmark = benchmark_run and (
        ("mxnet_training" in fixture_names and "gpu_only" in fixture_names)
        or "mxnet_inference" in fixture_names)
    tensorflow_gpu_horovod = ("tensorflow_training" in fixture_names
                              and "gpu_only" in fixture_names
                              and "horovod" in ec2_key_name)
    if mxnet_benchmark or tensorflow_gpu_horovod:
        # These test combinations get a 300 (VolumeSize) volume on /dev/sda1.
        params["BlockDeviceMappings"] = [{
            "DeviceName": "/dev/sda1",
            "Ebs": {
                "VolumeSize": 300,
            }
        }]

    # Initialised up front so a launch that never succeeds is reported
    # explicitly instead of surfacing as an UnboundLocalError/IndexError.
    instances = None
    last_error = None
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{
            'Type': ei_accelerator_type,
            'Count': 1
        }]
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"]
        }
        # Try each zone in turn; stop at the first one that yields an instance.
        for a_zone in availability_zones[region]:
            params["Placement"] = {'AvailabilityZone': a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} with Error: {e}")
                last_error = e
                continue
            if instances:
                break
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response['Error']['Code'] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise

    if not instances:
        raise RuntimeError(
            f"Unable to launch {ec2_instance_type} instance in {region}"
        ) from last_error
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)
    return instance_id, key_filename
def ec2_instance(
    request,
    ec2_client,
    ec2_resource,
    ec2_instance_type,
    ec2_key_name,
    ec2_instance_role_name,
    ec2_instance_ami,
    region,
    ei_accelerator_type,
):
    """
    Launch one EC2 instance for a test and register its cleanup.

    For "p3dn.24xlarge" the region, client, resource and (unless it is the
    PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1 AMI) the AMI are overridden.
    The root volume size is chosen from the requested fixtures, and when
    ``ei_accelerator_type`` is set the launch is attempted in a fixed list of
    availability zones for the region until one succeeds.

    :param request: object exposing ``fixturenames`` and ``addfinalizer``
        (presumably the pytest fixture request -- confirm against caller)
    :param ec2_client: EC2 client used for the key pair and for termination
    :param ec2_resource: EC2 resource whose ``create_instances`` is called
    :param ec2_instance_type: value passed as ``InstanceType``
    :param ec2_key_name: key pair name; also used in the instance "Name" tag
    :param ec2_instance_role_name: value passed as the IAM instance profile name
    :param ec2_instance_ami: value passed as ``ImageId``
    :param region: region used for the zone lookup and the state checks
    :param ei_accelerator_type: Elastic Inference accelerator type; falsy to
        launch without an accelerator
    :return: <tuple> instance_id, key_filename
    :raises ClientError: if the non-accelerator launch fails (re-raised unless
        the test is skipped for insufficient capacity)
    :raises RuntimeError: if no instance could be launched at all
    """
    if ec2_instance_type == "p3dn.24xlarge":
        region = P3DN_REGION
        ec2_client = boto3.client("ec2",
                                  region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource(
            "ec2",
            region_name=region,
            config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            ec2_instance_ami = (AML2_GPU_DLAMI_US_EAST_1
                                if ec2_instance_ami == AML2_GPU_DLAMI_US_WEST_2
                                else UBUNTU_18_BASE_DLAMI_US_EAST_1)

    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # In PR context the key pair is destroyed right away; otherwise its
        # file name is appended to KEYS_TO_DESTROY_FILE.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)

    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {
            "Name": ec2_instance_role_name
        },
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": f"CI-CD {ec2_key_name}"
                }]
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }

    # Device name depends on whether the AMI is in test_utils.UL_AMI_LIST.
    volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda"

    # os.getenv returns None when TEST_TYPE is unset; fall back to "" so the
    # substring test below cannot raise TypeError.
    test_type = os.getenv("TEST_TYPE") or ""
    fixture_names = request.fixturenames

    if ("pytorch_training_habana" in fixture_names
            or "tensorflow_training_habana" in fixture_names
            or "hpu" in fixture_names):
        user_data = """#!/bin/bash
        sudo apt-get update && sudo apt-get install -y awscli"""
        params["UserData"] = user_data
        volume_size = 1000
    elif (
        (("benchmark" in test_type or is_benchmark_dev_context())
         and (("mxnet_training" in fixture_names
               and "gpu_only" in fixture_names)
              or "mxnet_inference" in fixture_names))
            # NOTE(review): is_neuron_image is referenced, not called, and is
            # not defined in this function. If it is a module-level function
            # this operand is always truthy and the 90GB branch below is
            # unreachable -- confirm the intended check.
            or (is_neuron_image) or
        ("tensorflow_training" in fixture_names
         and "gpu_only" in fixture_names and "horovod" in ec2_key_name)
            or ("tensorflow_inference" in fixture_names
                and "graviton_compatible_only" in fixture_names)
            or ("graviton" in fixture_names)):
        volume_size = 300
    else:
        # Using private AMI, the EBS volume size is reduced to 28GB as opposed to 50GB from public AMI. This leads to space issues on test instances
        # TODO: Revert the configuration once DLAMI is public
        volume_size = 90
    params["BlockDeviceMappings"] = [{
        "DeviceName": volume_name,
        "Ebs": {
            "VolumeSize": volume_size,
        },
    }]

    # Initialised up front so a launch that never succeeds is reported
    # explicitly instead of surfacing as an UnboundLocalError/IndexError.
    instances = None
    last_error = None
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{
            "Type": ei_accelerator_type,
            "Count": 1
        }]
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"],
        }
        # Try each zone in turn; stop at the first one that yields an instance.
        for a_zone in availability_zones[region]:
            params["Placement"] = {"AvailabilityZone": a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} due to {e}")
                last_error = e
                continue
            if instances:
                break
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response["Error"]["Code"] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise

    if not instances:
        raise RuntimeError(
            f"Unable to launch {ec2_instance_type} instance in {region}"
        ) from last_error
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)
    return instance_id, key_filename