def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job whose training image is not an RL image must fail.

    The mocked ``describe_training_job`` response reports an MXNet training
    image, and ``RLEstimator.attach`` is expected to raise ``ValueError``
    with a message saying the job did not use an image for the requested
    framework.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Check the raised exception itself: str() of the ExceptionInfo wrapper is
    # a pytest-version-dependent representation rather than the message.
    assert "didn't use image for requested framework" in str(error.value)
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job whose training image is not an RL image must fail.

    The mocked ``describe_training_job`` response reports an MXNet training
    image, and ``RLEstimator.attach`` is expected to raise ``ValueError``
    with a message saying the job did not use an image for the requested
    framework.
    """
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4"
    rjd = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Check the raised exception itself: str() of the ExceptionInfo wrapper is
    # a pytest-version-dependent representation rather than the message.
    assert "didn't use image for requested framework" in str(error.value)
def test_coach_mxnet(sagemaker_session, coach_mxnet_latest_version, cpu_instance_type):
    """Fit the estimator from ``_test_coach``, re-attach, deploy and predict.

    The estimator is created for ``RLFramework.MXNET``. After fitting, a new
    estimator is attached to the same training job and deployed; predicting
    on an all-zero observation must return two leading values that are each
    strictly between 0 and 1.
    """
    trainer = _test_coach(
        sagemaker_session, RLFramework.MXNET, coach_mxnet_latest_version, cpu_instance_type
    )
    training_job_name = unique_name_from_base("test-coach-mxnet")

    with timeout(minutes=15):
        # NOTE(review): ``wait`` is given the non-empty string "False", which is
        # truthy. Kept as-is; confirm whether the boolean False was intended.
        trainer.fit(wait="False", job_name=training_job_name)

        attached = RLEstimator.attach(
            trainer.latest_training_job.name, sagemaker_session=sagemaker_session
        )

    timestamp = sagemaker_timestamp()
    endpoint_name = "test-mxnet-coach-deploy-{}".format(timestamp)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        endpoint = attached.deploy(
            1, cpu_instance_type, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name
        )
        zero_observation = numpy.asarray([0, 0, 0, 0])
        action = endpoint.predict(zero_observation)

    # Both leading entries of the first result row must lie in the open interval (0, 1).
    for position in (0, 1):
        assert 0 < action[0][position] < 1
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job trained with a custom image keeps that image.

    The mocked job description reports ``rl:latest`` as its training image;
    the attached estimator must expose it through both ``image_name`` and
    ``train_image()``.
    """
    training_image = 'rl:latest'

    algorithm_spec = {'TrainingInputMode': 'File', 'TrainingImage': training_image}
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': algorithm_spec,
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    # Every describe_training_job call on the session returns the canned description.
    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name='neo',
                                  sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == 'neo'
    assert attached.image_name == training_image
    assert attached.train_image() == training_image
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job trained with a custom image keeps that image.

    The mocked job description reports ``rl:latest`` as its training image;
    the attached estimator must expose it through both ``image_name`` and
    ``train_image()``.
    """
    training_image = "rl:latest"

    algorithm_spec = {"TrainingInputMode": "File", "TrainingImage": training_image}
    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": algorithm_spec,
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    # Every describe_training_job call on the session returns the canned description.
    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == "neo"
    assert attached.image_name == training_image
    assert attached.train_image() == training_image
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attaching to a Coach/MXNet RL job restores the estimator configuration.

    A canned ``describe_training_job`` response is served by a mock, and the
    attached estimator is checked field by field against the values in that
    response (framework, toolkit, versions, role, resources, output location,
    hyperparameters, source location and default metric definitions).
    """
    mxnet = RLFramework.MXNET.value
    coach = RLToolkit.COACH.value

    image_template = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3"
    training_image = image_template.format(mxnet, coach, rl_coach_mxnet_version)

    # Framework version that the version map pairs with this Coach toolkit version.
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[coach]
    expected_framework_version = coach_versions[rl_coach_mxnet_version][mxnet]

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"train_coach.py"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Identity of the job, framework and toolkit.
    assert attached.latest_training_job.job_name == "neo"
    assert attached.framework == mxnet
    assert attached.toolkit == coach
    assert attached.framework_version == expected_framework_version
    assert attached.toolkit_version == rl_coach_mxnet_version

    # Settings taken from the job description.
    assert attached.role == "arn:aws:iam::366:role/SageMakerRole"
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == "File"
    assert attached.base_job_name == "neo"
    assert attached.output_path == "s3://place/output/neo"
    assert attached.output_kms_key == ""

    # Hyperparameters and source location.
    assert attached.hyperparameters()["training_steps"] == "100"
    assert attached.source_dir == "s3://some/sourcedir.tar.gz"
    assert attached.entry_point == "train_coach.py"

    expected_metrics = RLEstimator.default_metric_definitions(RLToolkit.COACH)
    assert attached.metric_definitions == expected_metrics
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attaching to a Coach/MXNet RL job restores the estimator configuration.

    A canned ``describe_training_job`` response is served by a mock, and the
    attached estimator is checked field by field against the values in that
    response (framework, toolkit, versions, role, resources, output location,
    hyperparameters, source location and default metric definitions).
    """
    mxnet = RLFramework.MXNET.value
    coach = RLToolkit.COACH.value

    image_template = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3'
    training_image = image_template.format(mxnet, coach, rl_coach_mxnet_version)

    # Framework version that the version map pairs with this Coach toolkit version.
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[coach]
    expected_framework_version = coach_versions[rl_coach_mxnet_version][mxnet]

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"train_coach.py"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name='neo',
                                  sagemaker_session=sagemaker_session)

    # Identity of the job, framework and toolkit.
    assert attached.latest_training_job.job_name == 'neo'
    assert attached.framework == mxnet
    assert attached.toolkit == coach
    assert attached.framework_version == expected_framework_version
    assert attached.toolkit_version == rl_coach_mxnet_version

    # Settings taken from the job description.
    assert attached.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == 'File'
    assert attached.base_job_name == 'neo'
    assert attached.output_path == 's3://place/output/neo'
    assert attached.output_kms_key == ''

    # Hyperparameters and source location.
    assert attached.hyperparameters()['training_steps'] == '100'
    assert attached.source_dir == 's3://some/sourcedir.tar.gz'
    assert attached.entry_point == 'train_coach.py'

    expected_metrics = RLEstimator.default_metric_definitions(RLToolkit.COACH)
    assert attached.metric_definitions == expected_metrics
def test_coach_mxnet(sagemaker_session, rl_coach_full_version):
    """Fit the estimator from ``_test_coach``, re-attach, deploy and predict.

    The estimator is created for ``RLFramework.MXNET``. After fitting, a new
    estimator is attached to the same training job and deployed on
    ``CPU_INSTANCE``; predicting on an all-zero observation must return two
    leading values that are each strictly between 0 and 1.
    """
    trainer = _test_coach(sagemaker_session, RLFramework.MXNET, rl_coach_full_version)

    with timeout(minutes=15):
        # NOTE(review): ``wait`` is given the non-empty string 'False', which is
        # truthy. Kept as-is; confirm whether the boolean False was intended.
        trainer.fit(wait='False')

        attached = RLEstimator.attach(trainer.latest_training_job.name,
                                      sagemaker_session=sagemaker_session)

    timestamp = sagemaker_timestamp()
    endpoint_name = 'test-mxnet-coach-deploy-{}'.format(timestamp)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        endpoint = attached.deploy(1, CPU_INSTANCE, entry_point='mxnet_deploy.py',
                                   endpoint_name=endpoint_name)
        zero_observation = numpy.asarray([0, 0, 0, 0])
        action = endpoint.predict(zero_observation)

    # Both leading entries of the first result row must lie in the open interval (0, 1).
    for position in (0, 1):
        assert 0 < action[0][position] < 1