Example 1
def test_performance_ec2_pytorch_inference_graviton_cpu(
        pytorch_inference_graviton, ec2_connection, region, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(
        pytorch_inference_graviton)
    threshold = get_threshold_for_image(framework_version,
                                        PYTORCH_INFERENCE_CPU_THRESHOLD)
    ec2_performance_pytorch_inference(pytorch_inference_graviton, "cpu",
                                      ec2_connection, region,
                                      PT_PERFORMANCE_INFERENCE_CPU_CMD,
                                      threshold)
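None of these excerpts include get_threshold_for_image itself. A minimal sketch of such a version-keyed lookup, assuming the threshold tables map version-specifier strings to values (the specifier keys and numbers below are placeholders, not the repo's actual data):

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Placeholder table; the real PYTORCH_INFERENCE_CPU_THRESHOLD values live in
# the repo's benchmark constants.
PYTORCH_INFERENCE_CPU_THRESHOLD = {
    ">=1.0,<2.0": 0.08,
    ">=2.0": 0.07,
}

def get_threshold_for_image(framework_version, lookup_table):
    # Return the threshold whose version specifier matches the parsed version.
    for specifier, threshold in lookup_table.items():
        if Version(framework_version) in SpecifierSet(specifier):
            return threshold
    raise KeyError(f"No threshold defined for version {framework_version}")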
Example 2
def test_performance_ec2_tensorflow_inference_graviton_cpu(
        tensorflow_inference_graviton, ec2_connection, ec2_instance_ami,
        region, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(
        tensorflow_inference_graviton)
    threshold = get_threshold_for_image(framework_version,
                                        TENSORFLOW_INFERENCE_CPU_THRESHOLD)
    ec2_performance_tensorflow_inference(tensorflow_inference_graviton, "cpu",
                                         ec2_connection, ec2_instance_ami,
                                         region, threshold)
Example 3
def test_performance_tensorflow_gpu_imagenet(tensorflow_training, ec2_connection, gpu_only, tf2_only):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_training)
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_TRAINING_GPU_IMAGENET_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
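The threshold={"Throughput": threshold} argument pairs a metric name with its minimum acceptable value. A hypothetical sketch of the validation this implies (illustrative only; the repo's actual check lives in its benchmark utilities):

def check_throughput_metrics(metrics, thresholds):
    # Throughput thresholds are floors: measured values must meet or exceed
    # them. Cost thresholds (see Example 9) would be ceilings instead.
    for name, floor in thresholds.items():
        assert metrics[name] >= floor, f"{name} {metrics[name]} below threshold {floor}"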
Example 4
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_training)
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_TRAINING_CPU_SYNTHETIC_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
Example 5
def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference,
                                                  ec2_connection,
                                                  ec2_instance_ami, region,
                                                  cpu_only):
    _, framework_version = get_framework_and_version_from_tag(
        tensorflow_inference)
    if Version(framework_version) == Version("2.4.1"):
        pytest.skip("This test times out, and needs to be run manually.")
    threshold = get_threshold_for_image(framework_version,
                                        TENSORFLOW_INFERENCE_CPU_THRESHOLD)
    ec2_performance_tensorflow_inference(tensorflow_inference, "cpu",
                                         ec2_connection, ec2_instance_ami,
                                         region, threshold)
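The skip above compares parsed Version objects rather than raw strings, which matters once a version component reaches two digits. A self-contained illustration:

from packaging.version import Version

assert Version("2.4.1") == Version("2.4.1")
assert Version("2.10.0") > Version("2.4.1")  # semantic comparison
assert "2.10.0" < "2.4.1"                    # string comparison gets this backwards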
Example 6
def test_performance_ec2_mxnet_inference_cpu(mxnet_inference, ec2_connection,
                                             cpu_only, py3_only):
    _, framework_version = get_framework_and_version_from_tag(mxnet_inference)
    threshold = get_threshold_for_image(
        framework_version, MXNET_INFERENCE_CPU_IMAGENET_THRESHOLD)
    execute_ec2_inference_performance_test(
        ec2_connection,
        mxnet_inference,
        MX_PERFORMANCE_INFERENCE_CPU_CMD,
        post_process=post_process_mxnet_ec2_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
Example 7
def test_performance_ec2_mxnet_training_cpu(mxnet_training, ec2_connection,
                                            cpu_only):
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    threshold = get_threshold_for_image(framework_version,
                                        MXNET_TRAINING_CPU_CIFAR_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        mxnet_training,
        MX_PERFORMANCE_TRAINING_CPU_CMD,
        post_process=post_process_mxnet_ec2_performance,
        data_source="cifar10",
        threshold={"Throughput": threshold},
    )
Example 8
def test_performance_pytorch_gpu_synthetic(pytorch_training, ec2_connection,
                                           gpu_only, py3_only):
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    threshold = get_threshold_for_image(
        framework_version, PYTORCH_TRAINING_GPU_SYNTHETIC_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        pytorch_training,
        PT_PERFORMANCE_TRAINING_GPU_SYNTHETIC_CMD,
        post_process=post_process_pytorch_gpu_py3_synthetic_ec2_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
Example 9
def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(
        connection, ecr_uri, test_cmd, region=DEFAULT_REGION):
    _, framework_version = get_framework_and_version_from_tag(ecr_uri)
    threshold = get_threshold_for_image(
        framework_version, PYTORCH_TRAINING_GPU_IMAGENET_THRESHOLD)
    repo_name, image_tag = ecr_uri.split("/")[-1].split(":")
    container_test_local_dir = os.path.join("$HOME", "container_tests")

    container_name = f"{repo_name}-performance-{image_tag}-ec2"

    # Make sure we are logged into ECR so we can pull the image
    connection.run(
        f"$(aws ecr get-login --no-include-email --region {region})",
        hide=True)
    connection.run(f"nvidia-docker pull -q {ecr_uri}")
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_name = f"imagenet_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt"
    log_location = os.path.join(container_test_local_dir, "benchmark", "logs",
                                log_name)
    # Run training command, display benchmark results to console
    try:
        connection.run(
            f"nvidia-docker run --user root "
            f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} "
            f"-e PR_CONTEXT={1 if is_pr_context() else 0} "
            f"--shm-size 8G --env OMP_NUM_THREADS=1 --name {container_name} "
            f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} "
            f"-v /home/ubuntu/:/root/:delegated "
            f"{ecr_uri} {os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}")
    finally:
        connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
    ec2_performance_upload_result_to_s3_and_validate(
        connection,
        ecr_uri,
        log_location,
        "imagenet",
        {"Cost": threshold},
        post_process_pytorch_gpu_py3_imagenet_ec2_training_performance,
        log_name,
    )
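A caveat on the ECR login above: aws ecr get-login exists only in AWS CLI v1 and was removed in v2. A minimal sketch of the v2 equivalent, assuming the same fabric-style connection and that the remote host runs AWS CLI v2:

registry = ecr_uri.split("/")[0]  # e.g. <account-id>.dkr.ecr.<region>.amazonaws.com
connection.run(
    f"aws ecr get-login-password --region {region} "
    f"| docker login --username AWS --password-stdin {registry}",
    hide=True,
)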
Example 10
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF SageMaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):  # 124: GNU timeout's exit status
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    if processor == "cpu":
        threshold_table = (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1
                           else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
    else:
        threshold_table = (TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes == 1
                           else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
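The timeout 45m wrapper deserves a note: GNU timeout exits with status 124 when the wrapped command times out, which is why return_code == 124 is excluded from the failure-log path above (the final assert run_out.ok still fails the test). A tiny local illustration using invoke's Context, which these tests already use:

from invoke import Context

ctx = Context()
result = ctx.run("timeout 1s sleep 5", warn=True)  # warn=True: don't raise on nonzero exit
assert result.return_code == 124  # GNU timeout's exit status for a timed-out command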
Example 11
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes,
                                              region, gpu_only, py3_only):
    """
    Run MX SageMaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via the shell script
    smtrain-resnet50-imagenet.sh, which sets num-epochs to 40 (this parameter is configurable).

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET, "mxnet",
                                          framework_version, "sagemaker",
                                          "training", device_cuda_str,
                                          py_version)
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True)

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(
        os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
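Unlike the TensorFlow benchmark in Example 10, which checks a single throughput floor, this test applies a two-sided check: accuracy has a floor and latency (time per epoch) has a ceiling. Distilled into a hypothetical helper (names illustrative, not from the repo):

def validate_training_benchmark(accuracy, time_per_epoch, accuracy_floor, latency_ceiling):
    # Accuracy must clear its floor; latency must stay under its ceiling.
    assert accuracy > accuracy_floor, (
        f"accuracy {accuracy} does not reach the threshold {accuracy_floor}")
    assert time_per_epoch < latency_ceiling, (
        f"time per epoch {time_per_epoch} exceeds the threshold {latency_ceiling}")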