Python DebuggerHookConfig Examples

Programming Language: Python

Namespace/Package Name: sagemaker.debugger.debugger

Examples at hotexamples.com: 3

Python DebuggerHookConfig - 3 examples found. These are the top-rated real-world Python examples of sagemaker.debugger.debugger.DebuggerHookConfig, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

DebuggerHookConfig(3)

_to_request_dict(2)

Example #1

Show file

File: test_debugger.py Project: owen-t/sagemaker-python-sdk

def test_mxnet_with_custom_rule_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MXNet MNIST with a custom rule and an explicit debugger hook config.

    After ``fit`` returns, the training job description is checked for:

    * one ``DebugRuleConfigurations`` entry per rule, matching the rule's
      name and image URI, with a volume size of 30 GB;
    * a ``DebugHookConfig`` equal to the request dict of the hook config
      that was passed to the estimator;
    * rule evaluation statuses equal to ``rule_job_summary()``.

    Finally waits for the rule jobs and asserts that none of them errored.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [_get_custom_rule(sagemaker_session)]

        # S3 URIs always use "/" as separator. os.path.join would use the
        # host OS separator (a backslash on Windows), so format explicitly.
        tensors_s3_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), uuid.uuid4()
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_s3_uri)

        # These are local filesystem paths, so os.path.join is appropriate.
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        # Rule configurations are compared positionally against ``rules``.
        rule_configs = job_description["DebugRuleConfigurations"]
        for index, rule in enumerate(rules):
            rule_config = rule_configs[index]
            assert rule_config["RuleConfigurationName"] == rule.name
            assert rule_config["RuleEvaluatorImage"] == rule.image_uri
            assert rule_config["VolumeSizeInGB"] == 30
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert (
            _get_rule_evaluation_statuses(job_description)
            == mx.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)

Example #2

Show file

File: test_debugger.py Project: owen-t/sagemaker-python-sdk

def test_mxnet_with_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MXNet MNIST with an explicit debugger hook config and no rules.

    After ``fit`` returns, asserts that the ``DebugHookConfig`` in the
    training job description equals the request dict of the hook config
    passed to the estimator, then waits for rule jobs and asserts that none
    of them errored.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # S3 URIs always use "/" as separator. os.path.join would use the
        # host OS separator (a backslash on Windows), so format explicitly.
        tensors_s3_uri = "s3://{}/{}/tensors".format(
            sagemaker_session.default_bucket(), uuid.uuid4()
        )
        debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_s3_uri)

        # These are local filesystem paths, so os.path.join is appropriate.
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)

Example #3

Show file

def test_debug_hook_disabled_with_checkpointing(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Check when ``_prepare_for_training`` keeps or disables the debugger hook.

    Five estimators are built and prepared; ``fit`` is never called:

    * MXNet, checkpointing, one instance: hook expected to stay enabled.
    * MXNet, checkpointing, two instances: hook expected to be ``False``.
    * PyTorch, checkpointing, smdistributed dataparallel: ``False``.
    * TensorFlow, checkpointing, smdistributed modelparallel: ``False``.
    * XGBoost, two instances, no checkpoint arguments: expected enabled.

    The disabled state is represented by ``False``, so the "enabled" checks
    reject both ``None`` and ``False``; ``is not None`` alone would also pass
    for a disabled hook.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # S3 URIs always use "/" as separator. os.path.join would use the
        # host OS separator (a backslash on Windows), so format explicitly.
        s3_output_path = "s3://{}/{}".format(
            sagemaker_session.default_bucket(), uuid.uuid4()
        )
        checkpoint_s3_uri = "{}/checkpoints".format(s3_output_path)
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path="{}/tensors".format(s3_output_path)
        )

        # Local filesystem path, so os.path.join is appropriate here.
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")

        # Estimator with checkpointing enabled
        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=checkpoint_s3_uri,
        )
        mx._prepare_for_training()

        # Debug Hook should be enabled (neither unset nor disabled)
        assert mx.debugger_hook_config is not None
        assert mx.debugger_hook_config is not False

        # Estimator with checkpointing enabled and Instance Count>1
        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=2,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=checkpoint_s3_uri,
        )
        mx._prepare_for_training()
        # Debug Hook should be disabled
        assert mx.debugger_hook_config is False

        # Estimator with checkpointing enabled and SMDataParallel Enabled
        pt = PyTorch(
            base_job_name="pytorch-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.8.0",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge,
            # with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            # Training using SMDataParallel Distributed Training Framework
            distribution={
                "smdistributed": {
                    "dataparallel": {
                        "enabled": True
                    }
                }
            },
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=checkpoint_s3_uri,
        )
        pt._prepare_for_training()
        # Debug Hook should be disabled
        assert pt.debugger_hook_config is False

        # Estimator with checkpointing enabled and SMModelParallel Enabled
        tf = TensorFlow(
            # NOTE(review): the job name says "smdataparallel" although this
            # estimator configures modelparallel; left unchanged because it
            # is a runtime value.
            base_job_name="tf-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="2.4.1",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge,
            # with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            # Training using SMModelParallel Distributed Training Framework
            distribution={
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True
                    }
                }
            },
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=checkpoint_s3_uri,
        )
        tf._prepare_for_training()
        # Debug Hook should be disabled
        assert tf.debugger_hook_config is False

        # XGBoost estimator with Instance Count>1. No checkpoint arguments,
        # debugger hook config or distribution are passed here.
        xg = XGBoost(
            base_job_name="test_xgboost",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.2-1",
            py_version="py3",
            instance_count=2,
            # For training with p3dn instance use - ml.p3dn.24xlarge,
            # with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
        )
        xg._prepare_for_training()
        # Debug Hook should be enabled (neither unset nor disabled).
        # NOTE(review): no hook config is passed, so this presumably relies on
        # a default being created during preparation - confirm it is available
        # in every region the integration tests run in.
        assert xg.debugger_hook_config is not None
        assert xg.debugger_hook_config is not False