def test_training_step(sagemaker_session):
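    """Check that a TrainingStep built from an Estimator with a ProfilerConfig
    serializes to the expected request dict and exposes TrainingJobName."""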
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="c4.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    inputs = TrainingInput(f"s3://{BUCKET}/train_manifest")
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = TrainingStep(name="MyTrainingStep",
                        estimator=estimator,
                        inputs=inputs,
                        cache_config=cache_config)
    assert step.to_request() == {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "Arguments": {
            "AlgorithmSpecification": {
                "TrainingImage": IMAGE_URI,
                "TrainingInputMode": "File"
            },
            "InputDataConfig": [{
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": f"s3://{BUCKET}/train_manifest",
                    }
                },
            }],
            "OutputDataConfig": {
                "S3OutputPath": f"s3://{BUCKET}/"
            },
            "ResourceConfig": {
                "InstanceCount": 1,
                "InstanceType": "c4.4xlarge",
                "VolumeSizeInGB": 30,
            },
            "RoleArn":
            ROLE,
            "StoppingCondition": {
                "MaxRuntimeInSeconds": 86400
            },
            "ProfilerConfig": {
                "ProfilingIntervalInMilliseconds": 500,
                "S3OutputPath": f"s3://{BUCKET}/",
            },
        },
        "CacheConfig": {
            "Enabled": True,
            "ExpireAfter": "PT1H"
        },
    }
    assert step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
Example #2
def test_mxnet_with_default_profiler_config_and_profiler_rule(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
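    """Train an MXNet job without an explicit ProfilerConfig and verify that the
    default profiler config and ProfilerReport rule are attached, and that
    enable_default_profiling then raises a ValueError."""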
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"] == ProfilerConfig(
            s3_output_path=mx.output_path,
            system_monitor_interval_millis=500)._to_request_dict())
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }

        with pytest.raises(ValueError) as error:
            mx.enable_default_profiling()
        assert "Debugger monitoring is already enabled." in str(error)
Example #3
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
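    """Run training with a built-in ProfilerReport rule that overrides
    CPUBottleneck_threshold and verify the custom rule parameters on the job."""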
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        custom_profiler_report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(CPUBottleneck_threshold=90),
            name="CustomProfilerReportRule")
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=[custom_profiler_report_rule],
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description.get("ProfilingStatus") == "Enabled"
        assert (job_description.get("ProfilerConfig") == ProfilerConfig(
            s3_output_path=mx.output_path,
            system_monitor_interval_millis=500)._to_request_dict())

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
            "CPUBottleneck_threshold": "90",
        }
Example #4
def test_compilation_step(sagemaker_session):
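    """Check that a CompilationStep built from an Estimator, Model, and
    CompilationInput serializes to the expected request dict."""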
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )

    model = Model(
        image_uri=IMAGE_URI,
        model_data="s3://output/tensorflow.tar.gz",
        sagemaker_session=sagemaker_session,
    )

    compilation_input = CompilationInput(
        target_instance_type="ml_inf",
        input_shape={"data": [1, 3, 1024, 1024]},
        output_path="s3://output",
        compile_max_run=100,
        framework="tensorflow",
        job_name="compile-model",
        compiler_options=None,
    )
    compilation_step = CompilationStep(name="MyCompilationStep",
                                       estimator=estimator,
                                       model=model,
                                       inputs=compilation_input)

    assert compilation_step.to_request() == {
        "Name": "MyCompilationStep",
        "Type": "Compilation",
        "Arguments": {
            "InputConfig": {
                "DataInputConfig": '{"data": [1, 3, 1024, 1024]}',
                "Framework": "TENSORFLOW",
                "S3Uri": "s3://output/tensorflow.tar.gz",
            },
            "OutputConfig": {
                "S3OutputLocation": "s3://output",
                "TargetDevice": "ml_inf"
            },
            "RoleArn": ROLE,
            "StoppingCondition": {
                "MaxRuntimeInSeconds": 100
            },
            "Tags": [],
        },
    }
Example #5
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
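    """Start training with framework profiling enabled, then update the
    framework profile via update_profiler and verify the new parameters."""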
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            framework_profile_params=FrameworkProfile(start_step=1,
                                                      num_steps=5))

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                profiler_config._to_request_dict()["ProfilingParameters"])
        assert job_description.get("ProfilingStatus") == "Enabled"

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        updated_framework_profile = FrameworkProfile(
            detailed_profiling_config=DetailedProfilingConfig(
                profile_default_steps=True))
        mx.update_profiler(framework_profile_params=updated_framework_profile)

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                updated_framework_profile.profiling_parameters)

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
Example #6
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
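    """Combine Debugger rules, a custom ProfilerRule, a DebuggerHookConfig, and
    framework profiling, then disable framework metrics via update_profiler."""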
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(),
                                   name="CustomProfilerReportRule"),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description[
            "ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
Example #7
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
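    """Start training with a custom ProfilerConfig, then update the profiler
    rule and monitoring interval via update_profiler and verify the changes."""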
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
        )
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description.get(
            "ProfilerConfig") == profiler_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        mx.update_profiler(
            rules=[ProfilerRule.sagemaker(rule_configs.CPUBottleneck())],
            system_monitor_interval_millis=500,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"][
            "S3OutputPath"] == profiler_config.s3_output_path
        assert job_description["ProfilerConfig"][
            "ProfilingIntervalInMilliseconds"] == 500

        profiler_report_rule_config = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_report_rule_config["RuleConfigurationName"])
        assert profiler_report_rule_config[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_report_rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
Example #8
def test_training_step_base_estimator(sagemaker_session):
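    """Like test_training_step, but with pipeline Parameters for instance
    settings, data URI, and hyperparameters, plus depends_on handling."""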
    instance_type_parameter = ParameterString(name="InstanceType",
                                              default_value="c4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount",
                                                default_value=1)
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    training_epochs_parameter = ParameterInteger(name="TrainingEpochs",
                                                 default_value=5)
    training_batch_size_parameter = ParameterInteger(name="TrainingBatchSize",
                                                     default_value=500)
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        hyperparameters={
            "batch-size": training_batch_size_parameter,
            "epochs": training_epochs_parameter,
        },
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    inputs = TrainingInput(s3_data=data_source_uri_parameter)
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = TrainingStep(
        name="MyTrainingStep",
        depends_on=["TestStep"],
        estimator=estimator,
        inputs=inputs,
        cache_config=cache_config,
    )
    step.add_depends_on(["AnotherTestStep"])
    assert step.to_request() == {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "DependsOn": ["TestStep", "AnotherTestStep"],
        "Arguments": {
            "AlgorithmSpecification": {
                "TrainingImage": IMAGE_URI,
                "TrainingInputMode": "File"
            },
            "HyperParameters": {
                "batch-size": training_batch_size_parameter,
                "epochs": training_epochs_parameter,
            },
            "InputDataConfig": [{
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": data_source_uri_parameter,
                    }
                },
            }],
            "OutputDataConfig": {
                "S3OutputPath": f"s3://{BUCKET}/"
            },
            "ResourceConfig": {
                "InstanceCount": instance_count_parameter,
                "InstanceType": instance_type_parameter,
                "VolumeSizeInGB": 30,
            },
            "RoleArn":
            ROLE,
            "StoppingCondition": {
                "MaxRuntimeInSeconds": 86400
            },
            "ProfilerConfig": {
                "ProfilingIntervalInMilliseconds": 500,
                "S3OutputPath": f"s3://{BUCKET}/",
            },
        },
        "CacheConfig": {
            "Enabled": True,
            "ExpireAfter": "PT1H"
        },
    }
    assert step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
Example #9
def test_single_algo_tuning_step(sagemaker_session):
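    """Check that a TuningStep wrapping a single-algorithm HyperparameterTuner
    with warm start serializes to the expected request dict and expressions."""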
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )

    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(0.0001, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="val:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Maximize",
        max_jobs=5,
        max_parallel_jobs=2,
        early_stopping_type="OFF",
        strategy="Bayesian",
        warm_start_config=WarmStartConfig(
            warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
            parents=set(["parent-hpo"]),
        ),
    )

    inputs = TrainingInput(s3_data=data_source_uri_parameter)

    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs=inputs,
    )

    assert tuning_step.to_request() == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 5,
                    "MaxParallelTrainingJobs": 2
                },
                "TrainingJobEarlyStoppingType": "OFF",
                "HyperParameterTuningJobObjective": {
                    "Type": "Maximize",
                    "MetricName": "val:accuracy",
                },
                "ParameterRanges": {
                    "ContinuousParameterRanges": [
                        {
                            "Name": "learning_rate",
                            "MinValue": "0.0001",
                            "MaxValue": "0.05",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "momentum",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "weight_decay",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                    ],
                    "CategoricalParameterRanges": [],
                    "IntegerParameterRanges": [],
                },
            },
            "TrainingJobDefinition": {
                "StaticHyperParameters": {
                    "num_layers": "18",
                    "image_shape": "3,224,224",
                    "num_classes": "257",
                    "num_training_samples": "15420",
                    "mini_batch_size": "128",
                    "epochs": "10",
                    "optimizer": "sgd",
                    "top_k": "2",
                    "precision_dtype": "float32",
                    "augmentation_type": "crop",
                },
                "RoleArn":
                "DummyRole",
                "OutputDataConfig": {
                    "S3OutputPath": "s3://my-bucket/"
                },
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.c5.4xlarge",
                    "VolumeSizeInGB": 30,
                },
                "StoppingCondition": {
                    "MaxRuntimeInSeconds": 86400
                },
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "fakeimage",
                },
                "InputDataConfig": [{
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": data_source_uri_parameter,
                            "S3DataDistributionType": "FullyReplicated",
                        }
                    },
                    "ChannelName": "training",
                }],
            },
            "WarmStartConfig": {
                "WarmStartType":
                "IdenticalDataAndAlgorithm",
                "ParentHyperParameterTuningJobs": [{
                    "HyperParameterTuningJobName":
                    "parent-hpo",
                }],
            },
        },
    }

    assert tuning_step.properties.HyperParameterTuningJobName.expr == {
        "Get": "Steps.MyTuningStep.HyperParameterTuningJobName"
    }
    assert tuning_step.properties.TrainingJobSummaries[
        0].TrainingJobName.expr == {
            "Get": "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
        }
    assert tuning_step.get_top_model_s3_uri(
        0, "my-bucket", "my-prefix"
    ).expr == {
        "Std:Join": {
            "On":
            "/",
            "Values": [
                "s3:/",
                "my-bucket",
                "my-prefix",
                {
                    "Get":
                    "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
                },
                "output/model.tar.gz",
            ],
        }
    }
Example #10
def test_multi_algo_tuning_step(sagemaker_session):
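    """Check that a TuningStep wrapping a multi-algorithm tuner created via
    HyperparameterTuner.create emits one TrainingJobDefinition per estimator."""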
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
        max_retry_attempts=10,
    )

    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )

    initial_lr_param = ParameterString(name="InitialLR",
                                       default_value="0.0001")
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(initial_lr_param, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }

    tuner = HyperparameterTuner.create(
        estimator_dict={
            "estimator-1": estimator,
            "estimator-2": estimator,
        },
        objective_type="Minimize",
        objective_metric_name_dict={
            "estimator-1": "val:loss",
            "estimator-2": "val:loss",
        },
        hyperparameter_ranges_dict={
            "estimator-1": hyperparameter_ranges,
            "estimator-2": hyperparameter_ranges,
        },
    )

    inputs = TrainingInput(s3_data=data_source_uri_parameter)

    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs={
            "estimator-1": inputs,
            "estimator-2": inputs,
        },
    )

    assert tuning_step.to_request() == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 1,
                    "MaxParallelTrainingJobs": 1
                },
                "TrainingJobEarlyStoppingType": "Off",
            },
            "TrainingJobDefinitions": [
                {
                    "StaticHyperParameters": {
                        "num_layers": "18",
                        "image_shape": "3,224,224",
                        "num_classes": "257",
                        "num_training_samples": "15420",
                        "mini_batch_size": "128",
                        "epochs": "10",
                        "optimizer": "sgd",
                        "top_k": "2",
                        "precision_dtype": "float32",
                        "augmentation_type": "crop",
                    },
                    "RoleArn":
                    "DummyRole",
                    "OutputDataConfig": {
                        "S3OutputPath": "s3://my-bucket/"
                    },
                    "ResourceConfig": {
                        "InstanceCount": 1,
                        "InstanceType": "ml.c5.4xlarge",
                        "VolumeSizeInGB": 30,
                    },
                    "StoppingCondition": {
                        "MaxRuntimeInSeconds": 86400
                    },
                    "AlgorithmSpecification": {
                        "TrainingInputMode": "File",
                        "TrainingImage": "fakeimage",
                    },
                    "InputDataConfig": [{
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": data_source_uri_parameter,
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }],
                    "DefinitionName":
                    "estimator-1",
                    "TuningObjective": {
                        "Type": "Minimize",
                        "MetricName": "val:loss"
                    },
                    "HyperParameterRanges": {
                        "ContinuousParameterRanges": [
                            {
                                "Name": "learning_rate",
                                "MinValue": initial_lr_param,
                                "MaxValue": "0.05",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "momentum",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "weight_decay",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                        ],
                        "CategoricalParameterRanges": [],
                        "IntegerParameterRanges": [],
                    },
                    "RetryStrategy": {
                        "MaximumRetryAttempts": 10,
                    },
                },
                {
                    "StaticHyperParameters": {
                        "num_layers": "18",
                        "image_shape": "3,224,224",
                        "num_classes": "257",
                        "num_training_samples": "15420",
                        "mini_batch_size": "128",
                        "epochs": "10",
                        "optimizer": "sgd",
                        "top_k": "2",
                        "precision_dtype": "float32",
                        "augmentation_type": "crop",
                    },
                    "RoleArn":
                    "DummyRole",
                    "OutputDataConfig": {
                        "S3OutputPath": "s3://my-bucket/"
                    },
                    "ResourceConfig": {
                        "InstanceCount": 1,
                        "InstanceType": "ml.c5.4xlarge",
                        "VolumeSizeInGB": 30,
                    },
                    "StoppingCondition": {
                        "MaxRuntimeInSeconds": 86400
                    },
                    "AlgorithmSpecification": {
                        "TrainingInputMode": "File",
                        "TrainingImage": "fakeimage",
                    },
                    "InputDataConfig": [{
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": data_source_uri_parameter,
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }],
                    "DefinitionName":
                    "estimator-2",
                    "TuningObjective": {
                        "Type": "Minimize",
                        "MetricName": "val:loss"
                    },
                    "HyperParameterRanges": {
                        "ContinuousParameterRanges": [
                            {
                                "Name": "learning_rate",
                                "MinValue": initial_lr_param,
                                "MaxValue": "0.05",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "momentum",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "weight_decay",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                        ],
                        "CategoricalParameterRanges": [],
                        "IntegerParameterRanges": [],
                    },
                    "RetryStrategy": {
                        "MaximumRetryAttempts": 10,
                    },
                },
            ],
        },
    }