def test_training_step(sagemaker_session):
    """A TrainingStep built from a bare Estimator serializes to the expected request."""
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="c4.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    train_data = TrainingInput(f"s3://{BUCKET}/train_manifest")
    caching = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = TrainingStep(
        name="MyTrainingStep",
        estimator=estimator,
        inputs=train_data,
        cache_config=caching,
    )

    expected_request = {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "Arguments": {
            "AlgorithmSpecification": {
                "TrainingImage": IMAGE_URI,
                "TrainingInputMode": "File",
            },
            "InputDataConfig": [{
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": f"s3://{BUCKET}/train_manifest",
                    }
                },
            }],
            "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
            "ResourceConfig": {
                "InstanceCount": 1,
                "InstanceType": "c4.4xlarge",
                "VolumeSizeInGB": 30,
            },
            "RoleArn": ROLE,
            "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
            "ProfilerConfig": {
                "ProfilingIntervalInMilliseconds": 500,
                "S3OutputPath": f"s3://{BUCKET}/",
            },
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.to_request() == expected_request
    # Step properties resolve to pipeline "Get" expressions keyed by step name.
    assert step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
def test_mxnet_with_default_profiler_config_and_profiler_rule(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """An MXNet job launched without profiler settings gets the default
    ProfilerConfig and a ProfilerReport rule; enabling profiling again raises.

    Integration test: starts a real (non-blocking) training job and inspects
    its DescribeTrainingJob response.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        # With nothing specified, the SDK fills in the 500 ms default monitor interval.
        assert (job_description["ProfilerConfig"] == ProfilerConfig(
            s3_output_path=mx.output_path,
            system_monitor_interval_millis=500)._to_request_dict())
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        # Default rule names carry a generated numeric suffix, so match by prefix.
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }

        with pytest.raises(ValueError) as error:
            mx.enable_default_profiling()
        # BUG FIX: assert against the raised exception (error.value). str() of the
        # pytest ExceptionInfo wrapper is not guaranteed to include the message
        # text, which could make this assertion pass or fail for the wrong reason.
        assert "Debugger monitoring is already enabled." in str(error.value)
def test_mxnet_with_built_in_profiler_rule_with_custom_parameters(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """A built-in ProfilerReport rule with custom parameters is attached to the
    training job under the caller-supplied rule name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        custom_profiler_report_rule = ProfilerRule.sagemaker(
            rule_configs.ProfilerReport(CPUBottleneck_threshold=90),
            name="CustomProfilerReportRule")
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=[custom_profiler_report_rule],
        )

        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train"),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test"),
        }

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=job_name, wait=False)

        job_description = mx.latest_training_job.describe()
        assert job_description.get("ProfilingStatus") == "Enabled"
        assert (job_description.get("ProfilerConfig") == ProfilerConfig(
            s3_output_path=mx.output_path,
            system_monitor_interval_millis=500)._to_request_dict())

        rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        assert rule_config["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert rule_config["RuleEvaluatorImage"] == mx.rules[0].image_uri
        # Custom rule parameters are serialized into the request as strings.
        assert rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
            "CPUBottleneck_threshold": "90",
        }
def test_compilation_step(sagemaker_session):
    """A CompilationStep serializes its estimator/model/input trio into the
    expected Neo compilation request."""
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    model = Model(
        image_uri=IMAGE_URI,
        model_data="s3://output/tensorflow.tar.gz",
        sagemaker_session=sagemaker_session,
    )
    compile_input = CompilationInput(
        target_instance_type="ml_inf",
        input_shape={"data": [1, 3, 1024, 1024]},
        output_path="s3://output",
        compile_max_run=100,
        framework="tensorflow",
        job_name="compile-model",
        compiler_options=None,
    )
    step = CompilationStep(
        name="MyCompilationStep",
        estimator=estimator,
        model=model,
        inputs=compile_input,
    )

    expected_request = {
        "Name": "MyCompilationStep",
        "Type": "Compilation",
        "Arguments": {
            "InputConfig": {
                "DataInputConfig": '{"data": [1, 3, 1024, 1024]}',
                "Framework": "TENSORFLOW",
                "S3Uri": "s3://output/tensorflow.tar.gz",
            },
            "OutputConfig": {
                "S3OutputLocation": "s3://output",
                "TargetDevice": "ml_inf",
            },
            "RoleArn": ROLE,
            "StoppingCondition": {"MaxRuntimeInSeconds": 100},
            "Tags": [],
        },
    }
    assert step.to_request() == expected_request
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Launch with framework profiling enabled, then swap in a new framework
    profile via update_profiler and verify both states through describe()."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            framework_profile_params=FrameworkProfile(start_step=1, num_steps=5))
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train"),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test"),
        }

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=job_name, wait=False)

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                profiler_config._to_request_dict()["ProfilingParameters"])
        assert job_description.get("ProfilingStatus") == "Enabled"

        # The job must leave its transient states before UpdateTrainingJob is allowed.
        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            job_name)

        updated_framework_profile = FrameworkProfile(
            detailed_profiling_config=DetailedProfilingConfig(
                profile_default_steps=True))
        mx.update_profiler(framework_profile_params=updated_framework_profile)

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                updated_framework_profile.profiling_parameters)

        rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        # Default rule names carry a generated numeric suffix, so match by prefix.
        assert re.match(r"ProfilerReport-\d*",
                        rule_config["RuleConfigurationName"])
        assert rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name)
        assert rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Run with debugger rules, a custom profiler rule and explicit hook and
    profiler configs; then disable framework metrics and confirm they clear."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(),
                                   name="CustomProfilerReportRule"),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train"),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test"),
        }

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=job_name, wait=False)

        job_description = mx.latest_training_job.describe()
        assert job_description[
            "ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        assert rule_config["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert rule_config["RuleEvaluatorImage"] == mx.rules[0].image_uri
        assert rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        # Debugger rules are reported in the same order they were configured.
        for index, rule in enumerate(mx.debugger_rules):
            debug_rule = job_description["DebugRuleConfigurations"][index]
            assert debug_rule["RuleConfigurationName"] == rule.name
            assert debug_rule["RuleEvaluatorImage"] == rule.image_uri

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            job_name)

        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        # Disabling framework metrics empties ProfilingParameters entirely.
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start with a custom ProfilerConfig, then update both the rule set and
    the monitor interval in flight and verify the resulting job state."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
        )
        entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        channels = {
            "train": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/mxnet_mnist/train"),
            "test": mx.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/mxnet_mnist/test"),
        }

        job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(inputs=channels, job_name=job_name, wait=False)

        job_description = mx.latest_training_job.describe()
        assert job_description.get(
            "ProfilerConfig") == profiler_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        # Default rule names carry a generated numeric suffix, so match by prefix.
        assert re.match(r"ProfilerReport-\d*",
                        rule_config["RuleConfigurationName"])
        assert rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name)
        assert rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            job_name)

        mx.update_profiler(
            rules=[ProfilerRule.sagemaker(rule_configs.CPUBottleneck())],
            system_monitor_interval_millis=500,
        )

        job_description = mx.latest_training_job.describe()
        # The S3 output path is untouched by the update; only the interval changed.
        assert job_description["ProfilerConfig"][
            "S3OutputPath"] == profiler_config.s3_output_path
        assert job_description["ProfilerConfig"][
            "ProfilingIntervalInMilliseconds"] == 500

        report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        report_rule_config["RuleConfigurationName"])
        assert report_rule_config[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert report_rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
def test_training_step_base_estimator(sagemaker_session):
    """A TrainingStep built from pipeline parameters keeps the Parameter
    objects in the request and merges depends_on entries in order."""
    instance_type_parameter = ParameterString(name="InstanceType",
                                              default_value="c4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount",
                                                default_value=1)
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    training_epochs_parameter = ParameterInteger(name="TrainingEpochs",
                                                 default_value=5)
    training_batch_size_parameter = ParameterInteger(name="TrainingBatchSize",
                                                     default_value=500)

    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        hyperparameters={
            "batch-size": training_batch_size_parameter,
            "epochs": training_epochs_parameter,
        },
        rules=[],
        sagemaker_session=sagemaker_session,
    )

    step = TrainingStep(
        name="MyTrainingStep",
        depends_on=["TestStep"],
        estimator=estimator,
        inputs=TrainingInput(s3_data=data_source_uri_parameter),
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    # Dependencies added after construction are appended to the initial list.
    step.add_depends_on(["AnotherTestStep"])

    expected_request = {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "DependsOn": ["TestStep", "AnotherTestStep"],
        "Arguments": {
            "AlgorithmSpecification": {
                "TrainingImage": IMAGE_URI,
                "TrainingInputMode": "File",
            },
            "HyperParameters": {
                "batch-size": training_batch_size_parameter,
                "epochs": training_epochs_parameter,
            },
            "InputDataConfig": [{
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": data_source_uri_parameter,
                    }
                },
            }],
            "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
            "ResourceConfig": {
                "InstanceCount": instance_count_parameter,
                "InstanceType": instance_type_parameter,
                "VolumeSizeInGB": 30,
            },
            "RoleArn": ROLE,
            "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
            "ProfilerConfig": {
                "ProfilingIntervalInMilliseconds": 500,
                "S3OutputPath": f"s3://{BUCKET}/",
            },
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.to_request() == expected_request
    assert step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
def test_single_algo_tuning_step(sagemaker_session):
    """A TuningStep for a single-algorithm HyperparameterTuner serializes the
    job config, ranges and warm start, and exposes property/join expressions."""
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(0.0001, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }
    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="val:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Maximize",
        max_jobs=5,
        max_parallel_jobs=2,
        early_stopping_type="OFF",
        strategy="Bayesian",
        warm_start_config=WarmStartConfig(
            warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
            parents={"parent-hpo"},
        ),
    )
    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs=TrainingInput(s3_data=data_source_uri_parameter),
    )

    expected_request = {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 5,
                    "MaxParallelTrainingJobs": 2,
                },
                "TrainingJobEarlyStoppingType": "OFF",
                "HyperParameterTuningJobObjective": {
                    "Type": "Maximize",
                    "MetricName": "val:accuracy",
                },
                "ParameterRanges": {
                    "ContinuousParameterRanges": [
                        {
                            "Name": "learning_rate",
                            "MinValue": "0.0001",
                            "MaxValue": "0.05",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "momentum",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "weight_decay",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                    ],
                    "CategoricalParameterRanges": [],
                    "IntegerParameterRanges": [],
                },
            },
            "TrainingJobDefinition": {
                "StaticHyperParameters": {
                    "num_layers": "18",
                    "image_shape": "3,224,224",
                    "num_classes": "257",
                    "num_training_samples": "15420",
                    "mini_batch_size": "128",
                    "epochs": "10",
                    "optimizer": "sgd",
                    "top_k": "2",
                    "precision_dtype": "float32",
                    "augmentation_type": "crop",
                },
                "RoleArn": "DummyRole",
                "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.c5.4xlarge",
                    "VolumeSizeInGB": 30,
                },
                "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "fakeimage",
                },
                "InputDataConfig": [{
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": data_source_uri_parameter,
                            "S3DataDistributionType": "FullyReplicated",
                        }
                    },
                    "ChannelName": "training",
                }],
            },
            "WarmStartConfig": {
                "WarmStartType": "IdenticalDataAndAlgorithm",
                "ParentHyperParameterTuningJobs": [{
                    "HyperParameterTuningJobName": "parent-hpo",
                }],
            },
        },
    }
    assert tuning_step.to_request() == expected_request

    # Step properties resolve to pipeline "Get" expressions keyed by step name.
    assert tuning_step.properties.HyperParameterTuningJobName.expr == {
        "Get": "Steps.MyTuningStep.HyperParameterTuningJobName"
    }
    assert tuning_step.properties.TrainingJobSummaries[
        0].TrainingJobName.expr == {
            "Get": "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
        }
    # get_top_model_s3_uri builds a runtime Join over the best job's output path.
    assert tuning_step.get_top_model_s3_uri(0, "my-bucket", "my-prefix").expr == {
        "Std:Join": {
            "On": "/",
            "Values": [
                "s3:/",
                "my-bucket",
                "my-prefix",
                {
                    "Get": "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
                },
                "output/model.tar.gz",
            ],
        }
    }
def test_multi_algo_tuning_step(sagemaker_session):
    """A TuningStep for a multi-algorithm tuner serializes one
    TrainingJobDefinition per estimator, differing only by DefinitionName."""
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
        max_retry_attempts=10,
    )
    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )

    initial_lr_param = ParameterString(name="InitialLR", default_value="0.0001")
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(initial_lr_param, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }
    tuner = HyperparameterTuner.create(
        estimator_dict={
            "estimator-1": estimator,
            "estimator-2": estimator,
        },
        objective_type="Minimize",
        objective_metric_name_dict={
            "estimator-1": "val:loss",
            "estimator-2": "val:loss",
        },
        hyperparameter_ranges_dict={
            "estimator-1": hyperparameter_ranges,
            "estimator-2": hyperparameter_ranges,
        },
    )
    inputs = TrainingInput(s3_data=data_source_uri_parameter)
    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs={
            "estimator-1": inputs,
            "estimator-2": inputs,
        },
    )

    # Both estimators are configured identically, so their serialized training
    # job definitions are equal apart from the DefinitionName.
    base_definition = {
        "StaticHyperParameters": {
            "num_layers": "18",
            "image_shape": "3,224,224",
            "num_classes": "257",
            "num_training_samples": "15420",
            "mini_batch_size": "128",
            "epochs": "10",
            "optimizer": "sgd",
            "top_k": "2",
            "precision_dtype": "float32",
            "augmentation_type": "crop",
        },
        "RoleArn": "DummyRole",
        "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
        "ResourceConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.c5.4xlarge",
            "VolumeSizeInGB": 30,
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "fakeimage",
        },
        "InputDataConfig": [{
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": data_source_uri_parameter,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ChannelName": "training",
        }],
        "TuningObjective": {"Type": "Minimize", "MetricName": "val:loss"},
        "HyperParameterRanges": {
            "ContinuousParameterRanges": [
                {
                    "Name": "learning_rate",
                    "MinValue": initial_lr_param,
                    "MaxValue": "0.05",
                    "ScalingType": "Auto",
                },
                {
                    "Name": "momentum",
                    "MinValue": "0.0",
                    "MaxValue": "0.99",
                    "ScalingType": "Auto",
                },
                {
                    "Name": "weight_decay",
                    "MinValue": "0.0",
                    "MaxValue": "0.99",
                    "ScalingType": "Auto",
                },
            ],
            "CategoricalParameterRanges": [],
            "IntegerParameterRanges": [],
        },
        "RetryStrategy": {
            "MaximumRetryAttempts": 10,
        },
    }

    assert tuning_step.to_request() == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 1,
                    "MaxParallelTrainingJobs": 1,
                },
                "TrainingJobEarlyStoppingType": "Off",
            },
            "TrainingJobDefinitions": [
                {**base_definition, "DefinitionName": "estimator-1"},
                {**base_definition, "DefinitionName": "estimator-2"},
            ],
        },
    }