def test_ecs_tensorflow_inference_gpu(tensorflow_inference, ecs_container_instance, region, gpu_only):
    """Run a TensorFlow Serving inference task on an ECS GPU container instance.

    Sets up an ECS inference service for the half_plus_two saved model on the
    worker instance, requests an inference against the instance's public IP,
    and tears the service down regardless of outcome.

    :param tensorflow_inference: TF inference image URI fixture
    :param ecs_container_instance: (worker_instance_id, ecs_cluster_arn) fixture
    :param region: region the worker instance was launched in
    :param gpu_only: marker fixture restricting this test to GPU images
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id, region=region)
    # Pass region explicitly so the GPU count is queried in the same region the
    # instance was launched in (consistent with the pytorch GPU test); omitting
    # it would fall back to the helper's default region.
    num_gpus = ec2_utils.get_instance_num_gpus(worker_instance_id, region=region)
    model_name = "saved_model_half_plus_two"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference,
            "tensorflow",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_gpus=num_gpus,
            region=region,
        )
        model_name = get_tensorflow_model_name("gpu", model_name)
        inference_result = request_tensorflow_inference(model_name, ip_address=public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Always clean up the ECS service and task definition, even on failure.
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn, service_name, task_family, revision)
def test_ecs_pytorch_inference_eia(pytorch_inference_eia, ecs_container_instance, ei_accelerator_type, region, eia_only):
    """Run a PyTorch densenet inference on an ECS instance with an EI accelerator attached."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    image_framework, image_framework_version = get_framework_and_version_from_tag(pytorch_inference_eia)
    # PT 1.3.1 images ship a differently named densenet model artifact.
    model_name = "pytorch-densenet-v1-3-1" if image_framework_version == "1.3.1" else "pytorch-densenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference_eia,
            "pytorch",
            cluster_arn,
            model_name,
            instance_id,
            ei_accelerator_type,
            region=region,
        )
        inference_result = request_pytorch_inference_densenet(public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def test_ecs_pytorch_inference_gpu(pytorch_inference, ecs_container_instance, region, gpu_only):
    """Run a PyTorch densenet inference task on an ECS GPU container instance."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    gpu_count = ec2_utils.get_instance_num_gpus(instance_id, region=region)
    model_name = "pytorch-densenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference,
            "pytorch",
            cluster_arn,
            model_name,
            instance_id,
            num_gpus=gpu_count,
            region=region,
        )
        inference_result = request_pytorch_inference_densenet(public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def ec2_connection(request, ec2_instance, ec2_key_name, region):
    """
    Fixture to establish connection with EC2 instance if necessary
    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    # Resolve the public IP once and reuse it for logging and the connection
    # (the original resolved it twice, costing an extra describe call).
    ip_address = ec2_utils.get_public_ip(instance_id, region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    conn = Connection(
        user=user,
        host=ip_address,
        connect_kwargs={"key_filename": [instance_pem_file]},
    )
    artifact_folder = f"{ec2_key_name}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    def delete_s3_artifact_copy():
        # Finalizer: remove the uploaded test artifacts after the test session.
        test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location)

    request.addfinalizer(delete_s3_artifact_copy)
    # Reuse artifact_folder instead of rebuilding the folder name so the
    # download path always matches exactly what was uploaded above.
    conn.run(
        f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests"
    )
    conn.run(
        f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*"
    )
    return conn
def test_ecs_mxnet_inference_eia(mxnet_inference_eia, ecs_container_instance, ei_accelerator_type, region, eia_only):
    """Run an MXNet resnet-152 inference on an ECS instance with an EI accelerator attached."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    model_name = "resnet-152-eia"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            mxnet_inference_eia,
            "mxnet",
            cluster_arn,
            model_name,
            instance_id,
            ei_accelerator_type,
            region=region,
        )
        # model_name holds the same "resnet-152-eia" value the original passed literally.
        inference_result = request_mxnet_inference(public_ip_address, model=model_name)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def test_ecs_pytorch_inference_neuron(pytorch_inference_neuron, ecs_container_instance, region):
    """Run a PyTorch resnet inference on an ECS Inferentia (Neuron) container instance."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    neuron_count = ec2_utils.get_instance_num_inferentias(instance_id, region=region)
    model_name = "pytorch-resnet-neuron"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference_neuron,
            "pytorch",
            cluster_arn,
            model_name,
            instance_id,
            num_neurons=neuron_count,
            region=region,
        )
        server_type = get_inference_server_type(pytorch_inference_neuron)
        inference_result = request_pytorch_inference_densenet(
            public_ip_address, server_type=server_type, model_name=model_name
        )
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def attach_ecs_worker_node(worker_instance_type, ami_id, cluster_name, cluster_arn=None, region=DEFAULT_REGION, worker_eia_capable=False):
    """
    Launch a worker instance in a cluster and verify it registers as an
    ACTIVE, agent-connected ECS container instance.

    :param worker_instance_type: EC2 instance type to launch for the worker
    :param ami_id: AMI to launch (presumably an ECS-optimized image — confirm with callers)
    :param cluster_name: ECS cluster the worker joins (written to /etc/ecs/ecs.config)
    :param cluster_arn: cluster ARN; defaults to cluster_name when not provided
    :param region: region to launch the worker in
    :param worker_eia_capable: whether to launch with Elastic Inference support
    :return: <tuple> instance_id, public_ip_address
    :raises Exception: if no ACTIVE container instance is found for the new worker
    """
    # User data makes the ECS agent join the requested cluster at boot.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"
    # Build the instance-profile ARN for the current account's ecsInstanceRole.
    sts_client = boto3.client('sts')
    account_id = sts_client.get_caller_identity().get('Account')
    ecs_role_name = "ecsInstanceRole"
    ecs_instance_role_arn = f"arn:aws:iam::{account_id}:instance-profile/{ecs_role_name}"
    instc = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=ecs_instance_role_arn,
        instance_name=f"ecs worker {cluster_name}",
        eia_capable=worker_eia_capable)
    instance_id = instc["InstanceId"]
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    # Wait until the instance is running and passes both status checks.
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    # Confirm the ECS agent on this specific instance connected to the cluster.
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    if cluster_arn is None:
        cluster_arn = cluster_name
    container_arns = list_ecs_container_instances(cluster_arn, list_container_filter, "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type, region):
    """
    Fixture to establish connection with EC2 instance if necessary
    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param ec2_instance_type: ec2_instance_type pytest fixture
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    # p3dn instances are only available in a dedicated region; override accordingly.
    region = P3DN_REGION if ec2_instance_type == "p3dn.24xlarge" else region
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    LOGGER.info(f"Connecting to {user}@{ip_address}")
    conn = Connection(
        user=user,
        host=ip_address,
        connect_kwargs={"key_filename": [instance_pem_file]},
        connect_timeout=18000,
    )
    # Suffix the artifact folder with a random id so concurrent runs sharing a
    # key name do not collide in S3.
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 100000)
    artifact_folder = f"{ec2_key_name}-{unique_id}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    def delete_s3_artifact_copy():
        # Finalizer: remove the uploaded test artifacts after the test session.
        test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location)

    request.addfinalizer(delete_s3_artifact_copy)
    # Pull the uploaded tests onto the instance and make them executable.
    conn.run(
        f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests"
    )
    conn.run(
        f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*"
    )

    # Log into ECR if we are in canary context
    if test_utils.is_canary_context():
        public_registry = test_utils.PUBLIC_DLC_REGISTRY
        test_utils.login_to_ecr_registry(conn, public_registry, region)

    return conn
def __ecs_tensorflow_inference_cpu_nlp(tensorflow_inference, ecs_container_instance, region):
    """Run a TensorFlow Serving NLP (albert) inference on an ECS CPU container instance."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    model_name = "albert"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference,
            "tensorflow",
            cluster_arn,
            model_name,
            instance_id,
            region=region,
        )
        model_name = get_tensorflow_model_name("cpu", model_name)
        inference_result = request_tensorflow_inference_nlp(model_name, ip_address=public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def attach_ecs_worker_node(worker_instance_type, ami_id, cluster_name, cluster_arn=None, region=DEFAULT_REGION):
    """
    Launch a worker instance in a cluster and verify it registers as an
    ACTIVE, agent-connected ECS container instance.

    :param worker_instance_type: EC2 instance type to launch for the worker
    :param ami_id: AMI to launch (presumably an ECS-optimized image — confirm with callers)
    :param cluster_name: ECS cluster the worker joins (written to /etc/ecs/ecs.config)
    :param cluster_arn: cluster ARN; defaults to cluster_name when not provided
    :param region: region to launch the worker in
    :return: <tuple> instance_id, public_ip_address
    :raises Exception: if no ACTIVE container instance is found for the new worker
    """
    # User data makes the ECS agent join the requested cluster at boot.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"
    instc = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=ECS_INSTANCE_ROLE_ARN,
        instance_name=f"ecs worker {cluster_name}",
    )
    instance_id = instc["InstanceId"]
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    # Wait until the instance is running and passes both status checks.
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    # Confirm the ECS agent on this specific instance connected to the cluster.
    list_container_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    if cluster_arn is None:
        cluster_arn = cluster_name
    container_arns = list_ecs_container_instances(cluster_arn, list_container_filter, "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
def test_ecs_mxnet_inference_cpu(mxnet_inference, ecs_container_instance, region, cpu_only):
    """Run an MXNet squeezenet inference task on an ECS CPU container instance."""
    instance_id, cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    model_name = "squeezenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            mxnet_inference,
            "mxnet",
            cluster_arn,
            model_name,
            instance_id,
            region=region,
        )
        inference_result = request_mxnet_inference(public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Tear down the service/task definition no matter how the test ended.
        ecs_utils.tear_down_ecs_inference_service(cluster_arn, service_name, task_family, revision)
def test_ecs_tensorflow_inference_neuron(tensorflow_inference_neuron, ecs_container_instance, region):
    """Run a TensorFlow Serving inference on an ECS Inferentia (Neuron) container instance.

    Sets up an ECS inference service for the "simple" model, requests an
    inference against the instance's public IP, and tears the service down
    regardless of outcome.

    :param tensorflow_inference_neuron: TF Neuron inference image URI fixture
    :param ecs_container_instance: (worker_instance_id, ecs_cluster_arn) fixture
    :param region: region the worker instance was launched in
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id, region=region)
    # Pass region explicitly so the Inferentia count is queried in the same
    # region the instance was launched in (consistent with the pytorch neuron
    # test); omitting it would fall back to the helper's default region.
    num_neurons = ec2_utils.get_instance_num_inferentias(worker_instance_id, region=region)
    model_name = "simple"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference_neuron,
            "tensorflow",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_neurons=num_neurons,
            region=region,
        )
        model_name = get_tensorflow_model_name("neuron", model_name)
        inference_result = request_tensorflow_inference(
            model_name,
            ip_address=public_ip_address,
            inference_string="'{\"instances\": [[1.0, 2.0, 5.0]]}'",
        )
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"
    finally:
        # Always clean up the ECS service and task definition, even on failure.
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn, service_name, task_family, revision)