Example #1
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env,
                                    setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts

    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be setup
    :param setup_tf2_env: <bool> True if tf2 resources need to be setup
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir, "horovod")):
                # v0.19.4 is the last version for which horovod example tests are py2 compatible
                ctx.run(
                    "git clone -b v0.19.4 https://github.com/horovod/horovod.git"
                )
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir,
                                 "deep-learning-models")):
                # We clone branch tf2 for both 1.x and 2.x tests because tf2 branch contains all necessary files
                ctx.run(
                    f"git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git"
                )

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run(
                "pip install 'sagemaker>=2,<3' awscli boto3 botocore six==1.11"
            )

            # SageMaker TF estimator is coded to only accept framework versions up to 2.1.0 as py2 compatible.
            # Fixing this through the following changes:
            estimator_location = ctx.run(
                "echo $(pip3 show sagemaker |grep 'Location' |sed s/'Location: '//g)/sagemaker/tensorflow/estimator.py"
            ).stdout.strip("\n")
            system = ctx.run("uname -s").stdout.strip("\n")
            sed_input_arg = "'' " if system == "Darwin" else ""
            ctx.run(
                f"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}"
            )
    return venv_dir
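The Darwin check above exists because BSD sed on macOS expects an explicit (possibly empty) backup-suffix argument after -i, while GNU sed on Linux does not. Below is a minimal standalone sketch of the same idea, assuming only that sed is on PATH; the helper name portable_sed_inplace is illustrative and not part of the repo.

import platform
import subprocess


def portable_sed_inplace(sed_script, target_file):
    """Run an in-place sed edit that works with both GNU sed and BSD/macOS sed."""
    cmd = ["sed", "-i"]
    if platform.system() == "Darwin":
        # BSD sed needs a backup-suffix argument after -i; '' means "no backup file"
        cmd.append("")
    cmd += [sed_script, target_file]
    subprocess.run(cmd, check=True)


# Mirrors the estimator patch above (illustrative path):
# portable_sed_inplace(r"s/\[2, 1, 0\]/\[2, 1, 1\]/g", "/path/to/sagemaker/tensorflow/estimator.py")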
Example #2
def daemon_runner(pytestconfig, data_dir, downloads_dir, working_dir):
    """
    Provide an invoke `Local` object that has started the arduino-cli in daemon mode.
    This makes it simple to start the daemon and then kill its process via os.kill()
    once the test has finished.

    Useful reference:
        http://docs.pyinvoke.org/en/1.4/api/runners.html#invoke.runners.Local
        http://docs.pyinvoke.org/en/1.4/api/runners.html
    """
    cli_full_line = str(Path(pytestconfig.rootdir).parent / "arduino-cli daemon")
    env = {
        "ARDUINO_DATA_DIR": data_dir,
        "ARDUINO_DOWNLOADS_DIR": downloads_dir,
        "ARDUINO_SKETCHBOOK_DIR": data_dir,
    }
    (Path(data_dir) / "packages").mkdir()
    run_context = Context()
    # On Windows we might need to change directories across drives;
    # in that case the "/d" flag must be used, otherwise the directory won't change
    cd_command = "cd"
    if platform.system() == "Windows":
        cd_command += " /d"
    # Context.cd() is not used since it doesn't work correctly on Windows:
    # it escapes spaces in the path with "\ ", which doesn't always work;
    # wrapping the path in quotation marks is the safest approach
    run_context.prefix(f'{cd_command} "{working_dir}"')
    # Local Class is the implementation of a Runner abstract class
    runner = Local(run_context)
    runner.run(cli_full_line, echo=False, hide=True, warn=True, env=env, asynchronous=True)

    # we block here until the test function using this fixture has returned
    yield runner

    # Kill the runner's process as we finished our test (platform dependent)
    os_signal = signal.SIGTERM
    if platform.system() != "Windows":
        os_signal = signal.SIGKILL
    os.kill(runner.process.pid, os_signal)
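The teardown pattern above (start a long-running command with asynchronous=True, then kill its process with a platform-appropriate signal) can be reduced to a small sketch. The helper names and the placeholder command are illustrative; only the invoke calls already shown in the fixture are assumed.

import os
import platform
import signal

from invoke import Context
from invoke.runners import Local


def start_background(command, env=None):
    """Start `command` asynchronously and return the Local runner handle."""
    runner = Local(Context())
    runner.run(command, echo=False, hide=True, warn=True, env=env or {}, asynchronous=True)
    return runner


def stop_background(runner):
    """Kill the runner's subprocess; SIGKILL does not exist on Windows, so fall back to SIGTERM."""
    os_signal = signal.SIGTERM if platform.system() == "Windows" else signal.SIGKILL
    os.kill(runner.process.pid, os_signal)


# runner = start_background("sleep 60")  # placeholder long-running command
# ...
# stop_background(runner)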
Example #3
def test_sm_profiler_pt(pytorch_training):
    processor = get_processor_from_image_uri(pytorch_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) in SpecifierSet(">=1.12"):
        pytest.skip("sm profiler ZCC test is not supported in PT 1.12 and above")

    ctx = Context()

    profiler_tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"), get_container_name("smprof", pytorch_training), "smprofiler_tests"
    )
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    ctx.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True,
    )

    # PT test setup requirements
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        ctx.run(f"unzip {sm_tests_zip}", hide=True)
        with ctx.prefix("cd sagemaker-tests/tests/scripts/pytorch_scripts"):
            ctx.run("mkdir -p data", hide=True)
            ctx.run(
                "aws s3 cp s3://smdebug-testing/datasets/cifar-10-python.tar.gz data/cifar-10-batches-py.tar.gz",
                hide=True,
            )
            ctx.run("aws s3 cp s3://smdebug-testing/datasets/MNIST_pytorch.tar.gz data/MNIST_pytorch.tar.gz", hide=True)
            with ctx.prefix("cd data"):
                ctx.run("tar -zxf MNIST_pytorch.tar.gz", hide=True)
                ctx.run("tar -zxf cifar-10-batches-py.tar.gz", hide=True)

    run_sm_profiler_tests(pytorch_training, profiler_tests_dir, "test_profiler_pytorch.py", processor)
Example #4
def setup_sm_benchmark_mx_train_env(resources_location):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts
    :param resources_location: <str> directory in which test resources should be placed
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run("pip install -U 'sagemaker<2' awscli boto3 botocore")
    return venv_dir
Example #5
def execute_sagemaker_remote_tests(image):
    """
    Run pytest in a virtual env for a particular image
    Expected to run via multiprocessing
    :param image: ECR url
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
Example #6
def run_sagemaker_pytest_cmd(image):
    """
    Run pytest in a virtual env for a particular image

    Expected to run via multiprocessing

    :param image: ECR url
    """
    pytest_command, path, tag = generate_sagemaker_pytest_cmd(image)

    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            context.run(pytest_command)
Example #7
    def _run(cmd_string, custom_working_dir=None, custom_env=None):

        if not custom_working_dir:
            custom_working_dir = working_dir
        if not custom_env:
            custom_env = env
        cli_full_line = '"{}" {}'.format(cli_path, cmd_string)
        run_context = Context()
        # On Windows we might need to change directories across drives;
        # in that case the "/d" flag must be used, otherwise the directory won't change
        cd_command = "cd"
        if platform.system() == "Windows":
            cd_command += " /d"
        # Context.cd() is not used since it doesn't work correctly on Windows:
        # it escapes spaces in the path with "\ ", which doesn't always work;
        # wrapping the path in quotation marks is the safest approach
        with run_context.prefix(f'{cd_command} "{custom_working_dir}"'):
            return run_context.run(cli_full_line, echo=False, hide=True, warn=True, env=custom_env, encoding="utf-8")
Example #8
def execute_sagemaker_remote_tests(process_index, image, global_pytest_cache,
                                   pytest_cache_params):
    """
    Run pytest in a virtual env for a particular image. Creates a custom directory for each thread for pytest cache file.
    Stores pytest cache in a shared dict.  
    Expected to run via multiprocessing
    :param process_index - id for process. Used to create a custom cache dir 
    :param image - ECR url
    :param global_pytest_cache - shared Manager().dict() for cache merging
    :param pytest_cache_params - parameters required for s3 file path building
    """
    account_id = os.getenv(
        "ACCOUNT_ID",
        boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            pytest_cache_util.download_pytest_cache_from_s3_to_local(
                path,
                **pytest_cache_params,
                custom_cache_directory=str(process_index))
            # adding -o cache_dir with a custom directory name
            pytest_command += f" -o cache_dir={os.path.join(str(process_index), '.pytest_cache')}"
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
            cache_json = pytest_cache_util.convert_pytest_cache_file_to_json(
                path, custom_cache_directory=str(process_index))
            global_pytest_cache.update(cache_json)
            if res.failed:
                raise DLCSageMakerRemoteTestFailure(
                    f"{pytest_command} failed with error code: {res.return_code}\n"
                    f"Traceback:\n{res.stdout}")
    return None
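The global_pytest_cache argument in this example is meant to be a multiprocessing Manager().dict(), so each worker process can merge its cache entries into one shared mapping that the parent uses afterwards. A hedged sketch of that driver side is shown below; the function and variable names are illustrative and not taken from the repo.

from multiprocessing import Manager, Pool


def run_worker(args):
    process_index, image, shared_cache = args
    # Placeholder for the real per-image test run; each worker merges its
    # results into the shared dict, exactly as the function above does.
    shared_cache.update({image: f"cache-entries-from-process-{process_index}"})


if __name__ == "__main__":
    images = ["image-a", "image-b"]  # illustrative image identifiers
    with Manager() as manager:
        shared_cache = manager.dict()
        with Pool(processes=len(images)) as pool:
            pool.map(run_worker, [(i, image, shared_cache) for i, image in enumerate(images)])
        # The merged cache can now be converted/uploaded in one place by the parent process.
        print(dict(shared_cache))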
Example #9
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env,
                                    setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts
    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be setup
    :param setup_tf2_env: <bool> True if tf2 resources need to be setup
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir, "horovod")):
                ctx.run("git clone https://github.com/horovod/horovod.git")
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir,
                                 "deep-learning-models")):
                # We clone branch tf2 for both 1.x and 2.x tests because tf2 branch contains all necessary files
                ctx.run(
                    f"git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git"
                )

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run("pip install -U sagemaker awscli boto3 botocore six==1.11")
    return venv_dir
Example #10
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = ((TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes
                        == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
                       if processor == "cpu" else
                       TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes
                       == 1 else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
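Two small idioms above are easy to miss: Random(x=training_job_name).random() * 60 gives each job a sleep that is random-looking but deterministic per job name, which staggers concurrent SageMaker API calls, and return code 124 is how GNU timeout reports that the wrapped command hit its deadline, so a timed-out run is not immediately treated as a hard failure. A standalone sketch of both, with illustrative helper names:

import time
from random import Random

from invoke import Context


def staggered_start(job_name, max_delay_seconds=60):
    """Sleep for a jitter derived deterministically from the job name."""
    time.sleep(Random(x=job_name).random() * max_delay_seconds)


def run_with_deadline(command, minutes=45):
    """Run a command under GNU timeout; return (result, timed_out)."""
    result = Context().run(f"timeout {minutes}m {command}", warn=True)
    # GNU timeout exits with 124 when it kills the command at the deadline
    return result, result.return_code == 124


# staggered_start("tf2-tr-bench-gpu-1-node-py3-abc1234")
# result, timed_out = run_with_deadline("python tf_sm_benchmark.py ...")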
Example #11
class WorkspaceContext:
    def __init__(self, root):
        self.root = root
        self.mrover_build_root = os.path.join(os.path.expanduser('~'),
                                              '.mrover')
        self.jarvis_root = os.path.join(root, 'jarvis_files')
        self.third_party_root = os.path.join(root, '3rdparty')
        self.build_intermediate = os.path.join(self.mrover_build_root,
                                               'scratch')
        self.product_env = os.path.join(self.mrover_build_root, 'build_env')
        self.jarvis_env = os.path.join(self.mrover_build_root, 'jarvis_env')
        self.mbed_env = os.path.join(self.mrover_build_root, 'mbed_env')
        self.hash_store = os.path.join(self.mrover_build_root,
                                       'project_hashes')

        self.templates = Environment(loader=FileSystemLoader(
            os.path.join(self.jarvis_root, 'templates')))

        self.ctx = Context()

    def ensure_dir(self, d):
        """
        Creates a directory if it does not exist. After this function
        returns, the directory is guaranteed to exist.

        Parameters:
        d - the path to the directory to create.

        Raises:
        BuildError if there is a file named `d`.
        """
        if not os.path.exists(d):
            os.makedirs(d)
        else:
            if not os.path.isdir(d):
                raise BuildError("{} already exists and is a file".format(d))

    def ensure_build_dirs(self):
        """
        Ensures the build directory structure exists.
        """
        self.ensure_dir(self.mrover_build_root)
        self.ensure_dir(self.hash_store)

    def ensure_product_env(self, clear=False):
        """
        Ensures the product venv existence. If clear is True, re-creates
        the product venv.
        """
        self.ensure_build_dirs()
        if not os.path.isdir(self.product_env) and not clear:
            venv.create(self.product_env,
                        clear=clear,
                        symlinks=True,
                        with_pip=True)

    def ensure_mbed_env(self):
        """
        Ensures the mbed venv exists.
        """
        self.ensure_build_dirs()
        if not os.path.isdir(self.mbed_env):
            self.ctx.run('virtualenv --python=python2 {}'.format(
                self.mbed_env))

    @contextmanager
    def inside_product_env(self):
        """
        A context manager for activating the product venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(
                self.product_env)):
            yield

    @contextmanager
    def inside_mbed_env(self):
        """
        A context manager for activating the mbed venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(self.mbed_env)):
            yield

    def template(self, name, **kwargs):
        """
        Templates out a file and returns the rendered copy.
        """
        tpl = self.templates.get_template(name)
        return tpl.render(**kwargs)

    @contextmanager
    def cd(self, *args):
        with self.ctx.cd(*args):
            yield

    def run(self, *args, **kwargs):
        return self.ctx.run(*args, **kwargs)

    def get_product_file(self, *args):
        return os.path.join(self.product_env, *args)

    def get_jarvis_file(self, *args):
        return os.path.join(self.jarvis_env, *args)

    def get_mbed_file(self, *args):
        return os.path.join(self.mbed_env, *args)

    @contextmanager
    def intermediate(self, name, cleanup=False):
        """
        Create an intermediate build directory, then change directory to it.
        """
        intermediate = os.path.join(self.build_intermediate, name)
        self.ensure_dir(intermediate)
        if os.listdir(intermediate) and cleanup:
            shutil.rmtree(intermediate)
            self.ensure_dir(intermediate)

        with self.cd(intermediate):
            yield intermediate

        if cleanup:
            shutil.rmtree(intermediate)
Example #12
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Test runner to execute SM Profiler tests from the DLC repo
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    try:
        ctx.run(
            "pip install -r "
            "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
            "pip install smdebug && "
            "pip uninstall -y pytest-rerunfailures",
            hide=True,
        )
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    enable_sm_data_parallel_tests = "true"
    if framework == "pytorch" and Version(version) < Version("1.6"):
        enable_sm_data_parallel_tests = "false"
    if framework == "tensorflow" and Version(version) < Version("2.3"):
        enable_sm_data_parallel_tests = "false"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release"
    }

    # Command to set all necessary environment variables
    export_cmd = " && ".join(f"export {key}={val}"
                             for key, val in smprof_configs.items())
    export_cmd = f"{export_cmd} && export ENV_CPU_TRAIN_IMAGE=test && export ENV_GPU_TRAIN_IMAGE=test && " \
                 f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}"

    test_results_outfile = os.path.join(
        os.getcwd(), f"{get_container_name('smprof', image)}.txt")
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        with ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
            try:
                ctx.run(
                    f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                    f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                    hide=True,
                )
                with open(test_results_outfile) as outfile:
                    result_data = json.load(outfile)
                    LOGGER.info(
                        f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}"
                    )
            except Exception as e:
                if os.path.exists(test_results_outfile):
                    with open(test_results_outfile) as outfile:
                        result_data = json.load(outfile)
                    raise SMProfilerRCTestFailure(
                        f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
                    ) from e
                raise
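The --json-report and --json-report-file flags come from the pytest-json-report plugin; the file it writes is what the json.load calls above read back for logging and for the failure exception. Below is a hedged sketch of extracting a pass/fail summary from such a report; the field names assume the plugin's usual top-level "summary" layout.

import json


def summarize_json_report(report_path):
    """Return (passed, failed, total) counts from a pytest-json-report file."""
    with open(report_path) as report_file:
        report = json.load(report_file)
    # Missing counters default to 0 so a partially written report does not raise
    summary = report.get("summary", {})
    return summary.get("passed", 0), summary.get("failed", 0), summary.get("total", 0)


# passed, failed, total = summarize_json_report("smprof-container.txt")  # illustrative file name
# print(f"{passed}/{total} passed, {failed} failed")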
Example #13
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[1,2](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
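The with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate") combination is the recurring pattern in these examples for running a command inside a virtualenv from a given directory: cd() prepends a directory change and prefix() prepends the activation, so each ctx.run() inside the block effectively executes cd <dir> && source <venv>/bin/activate && <command>. A minimal sketch of that pattern on its own, assuming a POSIX shell and an existing virtualenv:

import os

from invoke import Context


def run_in_venv(work_dir, venv_dir, command):
    """Run `command` from `work_dir` with the virtualenv at `venv_dir` activated."""
    ctx = Context()
    activate = os.path.join(venv_dir, "bin", "activate")
    with ctx.cd(work_dir), ctx.prefix(f"source {activate}"):
        return ctx.run(command, warn=True, echo=True)


# result = run_in_venv("resources", "resources/sm_benchmark_venv", "python --version")
# print(result.ok, result.return_code)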
Example #14
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes,
                                              region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET, "mxnet",
                                          framework_version, "sagemaker",
                                          "training", device_cuda_str,
                                          py_version)
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True)

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(
        os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
Example #15
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py

    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)

    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"
    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
Example #16
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    framework_version = re.search(r"[1,2](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 > {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
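A pattern shared by all of the benchmark tests above is the final log handling: the captured log is copied to an S3 results prefix, and that prefix is rerouted under "failure_log" when the run did not succeed (for the timeout-wrapped variants, a 124 exit still counts as a completed run). A hedged sketch of that upload step; the bucket path in the usage comment is illustrative.

import os

from invoke import Context


def upload_benchmark_log(local_log_path, s3_prefix, run_succeeded):
    """Copy a benchmark log to S3, filing it under failure_log when the run failed."""
    if not run_succeeded:
        s3_prefix = os.path.join(s3_prefix, "failure_log")
    destination = os.path.join(s3_prefix, os.path.basename(local_log_path))
    Context().run(f"aws s3 cp {local_log_path} {destination}", warn=True)
    return destination


# dest = upload_benchmark_log(
#     "results-abc1234-2024-01-01-00-00-00-2-node.txt",
#     "s3://my-benchmark-results/tensorflow/2.8.0/sagemaker/training/gpu/py3",
#     run_succeeded=False,
# )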