def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
    """Run the PyTorch DenseNet MMS inference test against a DLC image on an EC2 host."""
    repo_name, image_tag = image_uri.split("/")[-1].split(":")
    container_name = f"{repo_name}-{image_tag}-ec2"
    model_name = "pytorch-densenet"
    mms_inference_cmd = test_utils.get_mms_run_command(model_name, processor)
    docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
    docker_run_cmd = (
        f"{docker_cmd} run -itd --name {container_name}"
        f" -p 80:8080 -p 8081:8081"
        f" {image_uri} {mms_inference_cmd}"
    )
    try:
        # Log in to ECR so the image can be pulled on the remote host
        ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
        LOGGER.info(docker_run_cmd)
        ec2_connection.run(docker_run_cmd, hide=True)
        inference_result = test_utils.request_pytorch_inference_densenet(connection=ec2_connection)
        assert (
            inference_result
        ), f"Failed to perform pytorch inference test for image: {image_uri} on ec2"
    finally:
        ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
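# The helper above is meant to be driven by a test entry point. The sketch below is purely
# illustrative: it assumes pytest-style fixtures named pytorch_inference (an ECR image URI),
# ec2_connection (a Fabric connection to the test instance), and region, none of which are
# defined in this section.
def test_ec2_pytorch_inference_gpu_sketch(pytorch_inference, ec2_connection, region):
    # Hypothetical caller exercising the GPU path, where the helper selects nvidia-docker.
    ec2_pytorch_inference(pytorch_inference, "gpu", ec2_connection, region)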
def host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection, is_neuron, model_name):
    """Prepare the EC2 host for TensorFlow Serving tests: install pips and stage serving scripts or Neuron model artifacts."""
    # TensorFlow 1.x has no 1.15.2 package, so pin with <= to effectively get 1.15
    ec2_connection.run(
        (
            f"pip3 install --user -qq -U 'tensorflow<={framework_version}' "
            f" 'tensorflow-serving-api<={framework_version}' "
        ),
        hide=True,
    )
    if os.path.exists(f"{serving_folder_path}"):
        ec2_connection.run(f"rm -rf {serving_folder_path}")
    if str(framework_version).startswith(TENSORFLOW1_VERSION):
        run_out = ec2_connection.run(
            f"git clone https://github.com/tensorflow/serving.git {serving_folder_path}"
        )
        git_branch_version = re.findall(r"[1-2]\.[0-9]\d", framework_version)[0]
        ec2_connection.run(f"cd {serving_folder_path} && git checkout r{git_branch_version}")
        LOGGER.info(f"Clone TF serving repository status {run_out.return_code == 0}")
        if is_neuron:
            # Stage the Neuron mnist client and download the pre-compiled Neuron model from S3
            container_test_local_file = os.path.join("$HOME", "container_tests/bin/neuron_tests/mnist_client.py")
            ec2_connection.run(
                f"cp -f {container_test_local_file} {serving_folder_path}/tensorflow_serving/example"
            )
            neuron_model_file_path = os.path.join(serving_folder_path, f"models/{model_name}/1")
            neuron_model_file = os.path.join(neuron_model_file_path, "saved_model.pb")
            LOGGER.info(f"Host Model path {neuron_model_file_path}")
            ec2_connection.run(f"mkdir -p {neuron_model_file_path}")
            model_file_path = f"https://aws-dlc-sample-models.s3.amazonaws.com/{model_name}_neuron/1/saved_model.pb"
            model_download = f"wget -O {neuron_model_file} {model_file_path}"
            ec2_connection.run(model_download)
    else:
        local_scripts_path = os.path.join("container_tests", "bin", "tensorflow_serving")
        ec2_connection.run(f"mkdir -p {serving_folder_path}")
        ec2_connection.run(f"cp -r {local_scripts_path} {serving_folder_path}")
def run_ec2_mxnet_inference(image_uri, model_name, container_tag, ec2_connection, processor, region, target_port, target_management_port):
    """Start an MXNet inference container on the EC2 host and verify a model-specific inference request succeeds."""
    repo_name, image_tag = image_uri.split("/")[-1].split(":")
    container_name = f"{repo_name}-{image_tag}-ec2-{container_tag}"
    docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
    mms_inference_cmd = test_utils.get_inference_run_command(image_uri, model_name, processor)
    docker_run_cmd = (
        f"{docker_cmd} run -itd --name {container_name}"
        f" -p {target_port}:8080 -p {target_management_port}:8081"
        f" {image_uri} {mms_inference_cmd}"
    )
    try:
        # Log in to ECR so the image can be pulled on the remote host
        ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
        LOGGER.info(docker_run_cmd)
        ec2_connection.run(docker_run_cmd, hide=True)
        # Only the three models below are handled; any other model_name leaves inference_result unset
        if model_name == SQUEEZENET_MODEL:
            inference_result = test_utils.request_mxnet_inference(
                port=target_port, connection=ec2_connection, model="squeezenet"
            )
        elif model_name == BERT_MODEL:
            inference_result = test_utils.request_mxnet_inference_gluonnlp(
                port=target_port, connection=ec2_connection
            )
        elif model_name == RESNET_EIA_MODEL:
            inference_result = test_utils.request_mxnet_inference(
                port=target_port, connection=ec2_connection, model=model_name
            )
        assert (
            inference_result
        ), f"Failed to perform mxnet {model_name} inference test for image: {image_uri} on ec2"
    finally:
        ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
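# Illustrative sketch of how run_ec2_mxnet_inference might be invoked for two of the supported
# models on distinct host ports so both containers can run on the same instance. The fixture
# names (mxnet_inference, ec2_connection, region) and the port values are assumptions, not
# definitions from this section; SQUEEZENET_MODEL and BERT_MODEL are referenced above.
def test_ec2_mxnet_inference_cpu_sketch(mxnet_inference, ec2_connection, region):
    # Hypothetical caller: squeezenet on 8080/8081, gluonnlp BERT on 8090/8091.
    run_ec2_mxnet_inference(mxnet_inference, SQUEEZENET_MODEL, "squeezenet", ec2_connection, "cpu", region, 8080, 8081)
    run_ec2_mxnet_inference(mxnet_inference, BERT_MODEL, "gluonnlp", ec2_connection, "cpu", region, 8090, 8091)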
def run_ec2_tensorflow_inference(image_uri, ec2_connection, grpc_port, region):
    """Run the TensorFlow Serving MNIST gRPC inference test against a DLC image on an EC2 host."""
    repo_name, image_tag = image_uri.split("/")[-1].split(":")
    container_name = f"{repo_name}-{image_tag}-ec2"
    framework_version = get_tensorflow_framework_version(image_uri)
    home_dir = ec2_connection.run("echo $HOME").stdout.strip("\n")
    serving_folder_path = os.path.join(home_dir, "serving")
    model_path = os.path.join(serving_folder_path, "models", "mnist")
    mnist_client_path = os.path.join(serving_folder_path, "tensorflow_serving", "example", "mnist_client.py")
    docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
    docker_run_cmd = (
        f"{docker_cmd} run -id --name {container_name} -p {grpc_port}:8500 "
        f"--mount type=bind,source={model_path},target=/models/mnist -e MODEL_NAME=mnist"
        f" {image_uri}"
    )
    try:
        host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection)
        sleep(2)
        train_mnist_model(serving_folder_path, ec2_connection)
        sleep(10)
        # Log in to ECR so the image can be pulled on the remote host
        ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
        LOGGER.info(docker_run_cmd)
        ec2_connection.run(docker_run_cmd, hide=True)
        sleep(20)
        test_utils.request_tensorflow_inference_grpc(
            script_file_path=mnist_client_path, port=grpc_port, connection=ec2_connection
        )
    finally:
        ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
def run_ec2_tensorflow_inference(image_uri, ec2_connection, ec2_instance_ami, grpc_port, region, telemetry_mode=False):
    """Run the TensorFlow Serving MNIST gRPC inference test, with Neuron and telemetry-mode support."""
    repo_name, image_tag = image_uri.split("/")[-1].split(":")
    container_name = f"{repo_name}-{image_tag}-ec2"
    framework_version = get_tensorflow_framework_version(image_uri)
    home_dir = ec2_connection.run("echo $HOME").stdout.strip("\n")
    serving_folder_path = os.path.join(home_dir, "serving")
    model_path = os.path.join(serving_folder_path, "models", "mnist")
    python_invoker = test_utils.get_python_invoker(ec2_instance_ami)
    mnist_client_path = os.path.join(serving_folder_path, "tensorflow_serving", "example", "mnist_client.py")
    is_neuron = "neuron" in image_uri
    docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
    if is_neuron:
        # Neuron containers need the Inferentia device, host networking, and Neuron monitor env vars
        docker_run_cmd = (
            f"{docker_cmd} run -id --name {container_name} -p {grpc_port}:8500 "
            f"--device=/dev/neuron0 --net=host --cap-add IPC_LOCK "
            f"-e NEURON_MONITOR_CW_REGION=us-east-1 -e NEURON_MONITOR_CW_NAMESPACE=tf1 "
            f"--mount type=bind,source={model_path},target=/models/mnist -e TEST_MODE=1 -e MODEL_NAME=mnist"
            f" {image_uri}"
        )
    else:
        docker_run_cmd = (
            f"{docker_cmd} run -id --name {container_name} -p {grpc_port}:8500 "
            f"--mount type=bind,source={model_path},target=/models/mnist -e TEST_MODE=1 -e MODEL_NAME=mnist"
            f" {image_uri}"
        )
    try:
        host_setup_for_tensorflow_inference(
            serving_folder_path, framework_version, ec2_connection, is_neuron, "mnist", python_invoker
        )
        sleep(2)
        if not is_neuron:
            # Neuron images use a pre-compiled model, so training is only needed otherwise
            train_mnist_model(serving_folder_path, ec2_connection, python_invoker)
            sleep(10)
        # Log in to ECR so the image can be pulled on the remote host
        ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
        LOGGER.info(docker_run_cmd)
        ec2_connection.run(docker_run_cmd, hide=True)
        sleep(20)
        test_utils.request_tensorflow_inference_grpc(
            script_file_path=mnist_client_path,
            port=grpc_port,
            connection=ec2_connection,
            ec2_instance_ami=ec2_instance_ami,
        )
        if telemetry_mode:
            check_telemetry(ec2_connection, container_name)
    finally:
        ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
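# Minimal sketch of driving the Neuron/telemetry variant of run_ec2_tensorflow_inference above.
# The fixtures (tensorflow_inference_neuron, ec2_connection, ec2_instance_ami, region) and the
# gRPC port value are assumptions for illustration only; they are not defined in this section.
def test_ec2_tensorflow_inference_neuron_sketch(tensorflow_inference_neuron, ec2_connection, ec2_instance_ami, region):
    # Hypothetical caller: telemetry_mode=True also runs check_telemetry against the container.
    run_ec2_tensorflow_inference(tensorflow_inference_neuron, ec2_connection, ec2_instance_ami, 8500, region, telemetry_mode=True)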
def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): repo_name, image_tag = image_uri.split("/")[-1].split(":") container_name = f"{repo_name}-{image_tag}-ec2" model_name = "pytorch-densenet" if processor == "eia": image_framework, image_framework_version = get_framework_and_version_from_tag( image_uri) if image_framework_version == "1.3.1": model_name = "pytorch-densenet-v1-3-1" if processor == "neuron": model_name = "pytorch-resnet-neuron" inference_cmd = test_utils.get_inference_run_command( image_uri, model_name, processor) docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker" if processor == "neuron": ec2_connection.run("sudo systemctl stop neuron-rtd" ) # Stop neuron-rtd in host env for DLC to start it docker_run_cmd = (f"{docker_cmd} run -itd --name {container_name}" f" -p 80:8080 -p 8081:8081" f" --device=/dev/neuron0 --cap-add IPC_LOCK" f" --env NEURON_MONITOR_CW_REGION={region}" f" {image_uri} {inference_cmd}") else: docker_run_cmd = (f"{docker_cmd} run -itd --name {container_name}" f" -p 80:8080 -p 8081:8081" f" {image_uri} {inference_cmd}") try: ec2_connection.run( f"$(aws ecr get-login --no-include-email --region {region})", hide=True) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) server_type = get_inference_server_type(image_uri) inference_result = test_utils.request_pytorch_inference_densenet( connection=ec2_connection, model_name=model_name, server_type=server_type) assert ( inference_result ), f"Failed to perform pytorch inference test for image: {image_uri} on ec2" finally: ec2_connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
def host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection):
    """Prepare the EC2 host for TensorFlow Serving tests: install pips and stage the TF Serving repo or local test scripts."""
    # TensorFlow 1.x has no 1.15.2 package, so pin with <= to effectively get 1.15
    ec2_connection.run(
        (
            f"pip install --user -qq -U 'tensorflow<={framework_version}' "
            f" 'tensorflow-serving-api<={framework_version}'"
        ),
        hide=True,
    )
    if os.path.exists(f"{serving_folder_path}"):
        ec2_connection.run(f"rm -rf {serving_folder_path}")
    if str(framework_version).startswith(TENSORFLOW1_VERSION):
        run_out = ec2_connection.run(
            f"git clone https://github.com/tensorflow/serving.git {serving_folder_path}"
        )
        git_branch_version = re.findall(r"[1-2]\.[0-9]\d", framework_version)[0]
        ec2_connection.run(f"cd {serving_folder_path} && git checkout r{git_branch_version}")
        LOGGER.info(f"Clone TF serving repository status {run_out.return_code == 0}")
    else:
        local_scripts_path = os.path.join("container_tests", "bin", "tensorflow_serving")
        ec2_connection.run(f"mkdir -p {serving_folder_path}")
        ec2_connection.run(f"cp -r {local_scripts_path} {serving_folder_path}")
def host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection, is_neuron, is_graviton, model_name, python_invoker):
    """Prepare the EC2 host for TensorFlow Serving tests, with Graviton- and Neuron-specific setup paths."""
    if is_graviton:
        # Use the TensorFlow binary built for Graviton (aarch64) instance types
        TF_URL = "https://aws-dlc-graviton-training-binaries.s3.us-west-2.amazonaws.com/tensorflow/2.6.0/tensorflow-2.6.0-cp38-cp38-linux_aarch64.whl"
        ec2_connection.run(f"{python_invoker} -m pip install --no-cache-dir -U {TF_URL}", hide=True)
        ec2_connection.run(
            f"{python_invoker} -m pip install --no-dependencies --no-cache-dir tensorflow-serving-api=={framework_version}",
            hide=True,
        )
    else:
        # TensorFlow 1.x has no 1.15.2 package, so pin with <= to effectively get 1.15
        ec2_connection.run(
            (
                f"{python_invoker} -m pip install --user -qq -U 'tensorflow<={framework_version}' "
                f" 'tensorflow-serving-api<={framework_version}' "
            ),
            hide=True,
        )
    if os.path.exists(f"{serving_folder_path}"):
        ec2_connection.run(f"rm -rf {serving_folder_path}")
    if str(framework_version).startswith(TENSORFLOW1_VERSION):
        run_out = ec2_connection.run(
            f"git clone https://github.com/tensorflow/serving.git {serving_folder_path}"
        )
        git_branch_version = re.findall(r"[1-2]\.[0-9]\d", framework_version)[0]
        ec2_connection.run(f"cd {serving_folder_path} && git checkout r{git_branch_version}")
        LOGGER.info(f"Clone TF serving repository status {run_out.return_code == 0}")
        if is_neuron:
            # Stage the Neuron mnist client and download the pre-compiled Neuron model from S3
            container_test_local_file = os.path.join("$HOME", "container_tests/bin/neuron_tests/mnist_client.py")
            ec2_connection.run(
                f"cp -f {container_test_local_file} {serving_folder_path}/tensorflow_serving/example"
            )
            neuron_model_file_path = os.path.join(serving_folder_path, f"models/{model_name}/1")
            neuron_model_file = os.path.join(neuron_model_file_path, "saved_model.pb")
            LOGGER.info(f"Host Model path {neuron_model_file_path}")
            ec2_connection.run(f"mkdir -p {neuron_model_file_path}")
            model_file_path = f"https://aws-dlc-sample-models.s3.amazonaws.com/{model_name}_neuron/1/saved_model.pb"
            model_download = f"wget -O {neuron_model_file} {model_file_path}"
            ec2_connection.run(model_download)
    else:
        local_scripts_path = os.path.join("container_tests", "bin", "tensorflow_serving")
        ec2_connection.run(f"mkdir -p {serving_folder_path}")
        ec2_connection.run(f"cp -r {local_scripts_path} {serving_folder_path}")
        if is_neuron:
            # Copy the pre-built "simple" Neuron test model from the host's container_tests directory
            neuron_local_model = os.path.join("$HOME", "container_tests", "bin", "neuron_tests", "simple")
            neuron_model_dir = os.path.join(serving_folder_path, "models")
            neuron_model_file_path = os.path.join(serving_folder_path, "models", model_name, "1")
            LOGGER.info(f"Host Model path {neuron_model_file_path}")
            LOGGER.info(f"Host Model Dir {neuron_model_dir}")
            ec2_connection.run(f"mkdir -p {neuron_model_file_path}")
            ec2_connection.run(f"cp -r {neuron_local_model} {neuron_model_dir}")
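# The setup helpers above stage model artifacts on the host before the serving container starts.
# The guard below is a hypothetical addition, not part of the original helpers: a caller could use
# it to fail fast if the expected saved_model.pb was never staged. It relies only on the Fabric
# connection API (run with warn=True/hide=True and return_code) already used throughout this section.
def assert_model_is_staged(ec2_connection, serving_folder_path, model_name):
    model_file = os.path.join(serving_folder_path, "models", model_name, "1", "saved_model.pb")
    # `test -f` exits non-zero when the file is missing; warn=True keeps Fabric from raising so we
    # can surface a clearer assertion message instead.
    result = ec2_connection.run(f"test -f {model_file}", warn=True, hide=True)
    assert result.return_code == 0, f"Expected staged model file not found on host: {model_file}"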