Esempio n. 1
0
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MXNet with debugger and profiler enabled, then disable framework metrics.

    The test starts a training job configured with two debugger rules, one
    custom-named profiler rule, a debugger hook config and a profiler config,
    and asserts that the job description reflects each of them. After
    ``_wait_until_training_can_be_updated`` returns, it calls
    ``update_profiler(disable_framework_metrics=True)`` and asserts that
    ``ProfilingParameters`` in the refreshed job description is empty.

    Args:
        sagemaker_session: Session fixture used for the S3 bucket, the
            estimator and the low-level SageMaker client.
        mxnet_training_latest_version: Fixture passed as ``framework_version``.
        mxnet_training_latest_py_version: Fixture passed as ``py_version``.
        cpu_instance_type: Fixture passed as ``instance_type``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            ProfilerRule.sagemaker(
                rule_configs.ProfilerReport(),
                name="CustomProfilerReportRule",
            ),
        ]
        # Each config gets its own random prefix under the session's bucket.
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=(
                f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors"
            ),
        )
        profiler_config = ProfilerConfig(
            s3_output_path=(
                f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system"
            ),
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        # wait=False: the test inspects and updates the job instead of blocking in fit().
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration["RuleConfigurationName"] == "CustomProfilerReportRule"
        # Compare against the ProfilerRule itself. mx.rules[0] is the
        # vanishing-gradient debugger rule, so indexing by 0 would check the
        # profiler configuration against an unrelated rule.
        profiler_rule = next(rule for rule in mx.rules if isinstance(rule, ProfilerRule))
        assert profiler_rule_configuration["RuleEvaluatorImage"] == profiler_rule.image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        # Debug rule configurations are compared positionally with the estimator's debugger rules.
        for index, rule in enumerate(mx.debugger_rules):
            debug_rule_configuration = job_description["DebugRuleConfigurations"][index]
            assert debug_rule_configuration["RuleConfigurationName"] == rule.name
            assert debug_rule_configuration["RuleEvaluatorImage"] == rule.image_uri

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
Esempio n. 2
0
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start MXNet training with a framework profile, then replace that profile.

    The job is launched with ``FrameworkProfile(start_step=1, num_steps=5)``
    and the test asserts that the described ``ProfilingParameters`` match the
    request built from the profiler config. After
    ``_wait_until_training_can_be_updated`` returns, the profiler is updated
    with a ``FrameworkProfile`` carrying a ``DetailedProfilingConfig`` and the
    description is checked again, together with the first profiler rule
    configuration (name pattern, evaluator image and rule parameters).

    Args:
        sagemaker_session: Session fixture used by the estimator and for the
            low-level SageMaker client.
        mxnet_training_latest_version: Fixture passed as ``framework_version``.
        mxnet_training_latest_py_version: Fixture passed as ``py_version``.
        cpu_instance_type: Fixture passed as ``instance_type``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        initial_profile = FrameworkProfile(start_step=1, num_steps=5)
        profiler_config = ProfilerConfig(framework_profile_params=initial_profile)

        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        # Upload the train channel first, then the test channel.
        train_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        channels = {"train": train_s3_uri, "test": test_s3_uri}
        # fit() is called with wait=False; the test waits explicitly further down.
        estimator.fit(inputs=channels, job_name=job_name, wait=False)

        description = estimator.latest_training_job.describe()
        reported_params = description["ProfilerConfig"]["ProfilingParameters"]
        requested_params = profiler_config._to_request_dict()["ProfilingParameters"]
        assert reported_params == requested_params
        assert description.get("ProfilingStatus") == "Enabled"

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, job_name)

        detailed_config = DetailedProfilingConfig(profile_default_steps=True)
        replacement_profile = FrameworkProfile(detailed_profiling_config=detailed_config)
        estimator.update_profiler(framework_profile_params=replacement_profile)

        description = estimator.latest_training_job.describe()
        reported_params = description["ProfilerConfig"]["ProfilingParameters"]
        assert reported_params == replacement_profile.profiling_parameters

        # Only the first profiler rule configuration is inspected.
        first_rule_config = description.get("ProfilerRuleConfigurations")[0]
        rule_name = first_rule_config["RuleConfigurationName"]
        assert re.match(r"ProfilerReport-\d*", rule_name)

        reported_image = first_rule_config["RuleEvaluatorImage"]
        expected_image = get_rule_container_image_uri(
            estimator.sagemaker_session.boto_region_name
        )
        assert reported_image == expected_image

        expected_rule_parameters = {"rule_to_invoke": "ProfilerReport"}
        assert first_rule_config["RuleParameters"] == expected_rule_parameters
Esempio n. 3
0
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start MXNet training with a custom profiler config, then update rule and interval.

    The job is launched with a ``ProfilerConfig`` that sets an S3 output path
    and a 1000 ms system monitor interval, and no explicit rules. The test
    asserts the described profiler config and that the first profiler rule
    configuration is a ProfilerReport rule. After
    ``_wait_until_training_can_be_updated`` returns, ``update_profiler`` is
    called with a ``CPUBottleneck`` profiler rule and a 500 ms interval; the
    test then asserts that the S3 output path still equals the original one,
    that the interval is 500, and that the first profiler rule configuration
    is still a ProfilerReport rule.

    Args:
        sagemaker_session: Session fixture used for the S3 bucket, the
            estimator and the low-level SageMaker client.
        mxnet_training_latest_version: Fixture passed as ``framework_version``.
        mxnet_training_latest_py_version: Fixture passed as ``py_version``.
        cpu_instance_type: Fixture passed as ``instance_type``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        system_output_path = (
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system"
        )
        profiler_config = ProfilerConfig(
            s3_output_path=system_output_path,
            system_monitor_interval_millis=1000,
        )

        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        # Upload the train channel first, then the test channel.
        train_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        channels = {"train": train_s3_uri, "test": test_s3_uri}
        # fit() is called with wait=False; the test waits explicitly further down.
        estimator.fit(inputs=channels, job_name=job_name, wait=False)

        description = estimator.latest_training_job.describe()
        assert description.get("ProfilerConfig") == profiler_config._to_request_dict()
        assert description.get("ProfilingStatus") == "Enabled"

        # Before the update: the first profiler rule configuration.
        initial_rule_config = description.get("ProfilerRuleConfigurations")[0]
        initial_rule_name = initial_rule_config["RuleConfigurationName"]
        assert re.match(r"ProfilerReport-\d*", initial_rule_name)

        initial_image = initial_rule_config["RuleEvaluatorImage"]
        expected_image = get_rule_container_image_uri(
            estimator.sagemaker_session.boto_region_name
        )
        assert initial_image == expected_image
        assert initial_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, job_name)

        cpu_bottleneck_rule = ProfilerRule.sagemaker(rule_configs.CPUBottleneck())
        estimator.update_profiler(
            rules=[cpu_bottleneck_rule],
            system_monitor_interval_millis=500,
        )

        description = estimator.latest_training_job.describe()
        described_profiler_config = description["ProfilerConfig"]
        assert described_profiler_config["S3OutputPath"] == profiler_config.s3_output_path
        assert described_profiler_config["ProfilingIntervalInMilliseconds"] == 500

        # After the update: the first profiler rule configuration is checked again.
        updated_rule_config = description.get("ProfilerRuleConfigurations")[0]
        updated_rule_name = updated_rule_config["RuleConfigurationName"]
        assert re.match(r"ProfilerReport-\d*", updated_rule_name)

        updated_image = updated_rule_config["RuleEvaluatorImage"]
        expected_image = get_rule_container_image_uri(
            estimator.sagemaker_session.boto_region_name
        )
        assert updated_image == expected_image
        assert updated_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}