# Example #1
def test_ecs_tensorflow_inference_gpu(tensorflow_inference,
                                      ecs_container_instance, region,
                                      gpu_only):
    """
    Smoke-test TensorFlow Serving GPU inference on an ECS service.

    :param tensorflow_inference: TF inference image URI fixture
    :param ecs_container_instance: fixture yielding (worker_instance_id, ecs_cluster_arn)
    :param region: AWS region the worker instance runs in
    :param gpu_only: pytest fixture restricting this test to GPU images
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id,
                                                region=region)
    # Pass region explicitly, matching the identical call in
    # test_ecs_pytorch_inference_gpu, instead of relying on the helper's
    # default-region fallback.
    num_gpus = ec2_utils.get_instance_num_gpus(worker_instance_id,
                                               region=region)

    model_name = "saved_model_half_plus_two"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference,
            "tensorflow",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_gpus=num_gpus,
            region=region)
        model_name = get_tensorflow_model_name("gpu", model_name)
        inference_result = request_tensorflow_inference(
            model_name, ip_address=public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        # Always tear down, even when service setup only partially succeeded.
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn,
                                                  service_name, task_family,
                                                  revision)
def test_ecs_pytorch_inference_eia(pytorch_inference_eia,
                                   ecs_container_instance, ei_accelerator_type,
                                   region, eia_only):
    """Smoke-test PyTorch densenet inference on an EIA-backed ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    framework, framework_version = get_framework_and_version_from_tag(
        pytorch_inference_eia)
    # The 1.3.1 image ships a differently named densenet model artifact.
    model_name = ("pytorch-densenet-v1-3-1"
                  if framework_version == "1.3.1" else "pytorch-densenet")

    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference_eia, "pytorch", cluster_arn, model_name,
            instance_id, ei_accelerator_type, region=region)
        succeeded = request_pytorch_inference_densenet(ip_address)
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name,
                                                  task_family, revision)
def test_ecs_pytorch_inference_gpu(pytorch_inference, ecs_container_instance,
                                   region, gpu_only):
    """Smoke-test PyTorch densenet inference on a GPU-backed ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    gpu_count = ec2_utils.get_instance_num_gpus(instance_id, region=region)

    model_name = "pytorch-densenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference, "pytorch", cluster_arn, model_name, instance_id,
            num_gpus=gpu_count, region=region)
        succeeded = request_pytorch_inference_densenet(ip_address)
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name,
                                                  task_family, revision)
# Example #4
def ec2_connection(request, ec2_instance, ec2_key_name, region):
    """
    Fixture to establish connection with EC2 instance if necessary
    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    # Resolve the public IP once instead of querying EC2 twice (the other
    # ec2_connection variant in this file already does this).
    ip_address = ec2_utils.get_public_ip(instance_id, region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    conn = Connection(user=user,
                      host=ip_address,
                      connect_kwargs={"key_filename": [instance_pem_file]})

    artifact_folder = f"{ec2_key_name}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    # Remove the uploaded artifacts when the requesting test finishes.
    def delete_s3_artifact_copy():
        test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location)

    request.addfinalizer(delete_s3_artifact_copy)

    # Use artifact_folder rather than re-deriving "{ec2_key_name}-folder" so
    # the downloaded prefix always matches the one actually uploaded above.
    conn.run(
        f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests"
    )
    conn.run(
        f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*"
    )

    return conn
# Example #5
def test_ecs_mxnet_inference_eia(mxnet_inference_eia, ecs_container_instance,
                                 ei_accelerator_type, region, eia_only):
    """Smoke-test MXNet resnet-152 inference on an EIA-backed ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    model_name = "resnet-152-eia"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            mxnet_inference_eia, "mxnet", cluster_arn, model_name, instance_id,
            ei_accelerator_type, region=region)
        succeeded = request_mxnet_inference(ip_address, model="resnet-152-eia")
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name,
                                                  task_family, revision)
def test_ecs_pytorch_inference_neuron(pytorch_inference_neuron,
                                      ecs_container_instance, region):
    """Smoke-test PyTorch resnet inference on an Inferentia-backed ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    neuron_count = ec2_utils.get_instance_num_inferentias(instance_id,
                                                          region=region)

    model_name = "pytorch-resnet-neuron"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference_neuron, "pytorch", cluster_arn, model_name,
            instance_id, num_neurons=neuron_count, region=region)
        server_type = get_inference_server_type(pytorch_inference_neuron)
        succeeded = request_pytorch_inference_densenet(
            ip_address, server_type=server_type, model_name=model_name)
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name,
                                                  task_family, revision)
# Example #7
def attach_ecs_worker_node(worker_instance_type,
                           ami_id,
                           cluster_name,
                           cluster_arn=None,
                           region=DEFAULT_REGION,
                           worker_eia_capable=False):
    """
    Launch a worker instance in a cluster and wait until it registers as an
    ACTIVE, agent-connected ECS container instance.

    :param worker_instance_type: EC2 instance type for the worker
    :param ami_id: AMI the worker is launched from
    :param cluster_name: ECS cluster the worker joins via user data
    :param cluster_arn: cluster ARN; falls back to cluster_name when omitted
    :param region: AWS region to launch in
    :param worker_eia_capable: whether to launch with Elastic Inference support
    :return: <tuple> instance_id, public_ip_address
    """
    # User data makes the ECS agent join the desired cluster on boot.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    # Build the instance-profile ARN for the current account.
    account_id = boto3.client('sts').get_caller_identity().get('Account')
    ecs_role_name = "ecsInstanceRole"
    ecs_instance_role_arn = f"arn:aws:iam::{account_id}:instance-profile/{ecs_role_name}"

    launched = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=ecs_instance_role_arn,
        instance_name=f"ecs worker {cluster_name}",
        eia_capable=worker_eia_capable)
    instance_id = launched["InstanceId"]

    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)

    if cluster_arn is None:
        cluster_arn = cluster_name
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    container_arns = list_ecs_container_instances(cluster_arn,
                                                  list_container_filter,
                                                  "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type,
                   region):
    """
    Fixture to establish connection with EC2 instance if necessary
    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param ec2_instance_type: ec2_instance_type pytest fixture
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    # p3dn instances use a dedicated region override.
    if ec2_instance_type == "p3dn.24xlarge":
        region = P3DN_REGION
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    LOGGER.info(f"Connecting to {user}@{ip_address}")
    conn = Connection(
        user=user,
        host=ip_address,
        connect_kwargs={"key_filename": [instance_pem_file]},
        connect_timeout=18000,
    )

    # Timestamp-seeded random suffix — presumably to keep artifact folders
    # from colliding across concurrent runs.
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 100000)
    artifact_folder = f"{ec2_key_name}-{unique_id}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    # Remove the uploaded artifacts when the requesting test finishes.
    def delete_s3_artifact_copy():
        test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location)

    request.addfinalizer(delete_s3_artifact_copy)

    conn.run(
        f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests"
    )
    conn.run(
        f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*"
    )

    # Log into ECR if we are in canary context
    if test_utils.is_canary_context():
        public_registry = test_utils.PUBLIC_DLC_REGISTRY
        test_utils.login_to_ecr_registry(conn, public_registry, region)

    return conn
# Example #9
def __ecs_tensorflow_inference_cpu_nlp(tensorflow_inference, ecs_container_instance, region):
    """Smoke-test TensorFlow NLP (albert) inference on a CPU ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    model_name = "albert"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference, "tensorflow", cluster_arn, model_name,
            instance_id, region=region)
        model_name = get_tensorflow_model_name("cpu", model_name)
        succeeded = request_tensorflow_inference_nlp(model_name, ip_address=ip_address)
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
# Example #10
def attach_ecs_worker_node(worker_instance_type,
                           ami_id,
                           cluster_name,
                           cluster_arn=None,
                           region=DEFAULT_REGION):
    """
    Launch a worker instance in a cluster and wait until it registers as an
    ACTIVE, agent-connected ECS container instance.

    :param worker_instance_type: EC2 instance type for the worker
    :param ami_id: AMI the worker is launched from
    :param cluster_name: ECS cluster the worker joins via user data
    :param cluster_arn: cluster ARN; falls back to cluster_name when omitted
    :param region: AWS region to launch in
    :return: <tuple> instance_id, public_ip_address
    """
    # User data makes the ECS agent join the desired cluster on boot.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    launched = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=ECS_INSTANCE_ROLE_ARN,
        instance_name=f"ecs worker {cluster_name}",
    )
    instance_id = launched["InstanceId"]

    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)

    if cluster_arn is None:
        cluster_arn = cluster_name
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    container_arns = list_ecs_container_instances(cluster_arn,
                                                  list_container_filter,
                                                  "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
# Example #11
def test_ecs_mxnet_inference_cpu(mxnet_inference, ecs_container_instance,
                                 region, cpu_only):
    """Smoke-test MXNet squeezenet inference on a CPU ECS service."""
    instance_id, cluster_arn = ecs_container_instance
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    model_name = "squeezenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            mxnet_inference, "mxnet", cluster_arn, model_name, instance_id,
            region=region)
        succeeded = request_mxnet_inference(ip_address)
        assert succeeded, f"Failed to perform inference at IP address: {ip_address}"
    finally:
        # Tear down regardless of setup/inference outcome.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name,
                                                  task_family, revision)
# Example #12
def test_ecs_tensorflow_inference_neuron(tensorflow_inference_neuron, ecs_container_instance, region):
    """
    Smoke-test TensorFlow Serving inference on an Inferentia-backed ECS service.

    :param tensorflow_inference_neuron: TF neuron inference image URI fixture
    :param ecs_container_instance: fixture yielding (worker_instance_id, ecs_cluster_arn)
    :param region: AWS region the worker instance runs in
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id, region=region)
    # Pass region explicitly, matching the identical call in
    # test_ecs_pytorch_inference_neuron, instead of relying on the helper's
    # default-region fallback.
    num_neurons = ec2_utils.get_instance_num_inferentias(worker_instance_id, region=region)

    model_name = "simple"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference_neuron,
            "tensorflow",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_neurons=num_neurons,
            region=region,
        )
        model_name = get_tensorflow_model_name("neuron", model_name)
        inference_result = request_tensorflow_inference(model_name, ip_address=public_ip_address, inference_string="'{\"instances\": [[1.0, 2.0, 5.0]]}'")
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        # Always tear down, even when service setup only partially succeeded.
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn, service_name, task_family, revision)