def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    """An estimator built from a custom image_uri should hand that image to create_model."""
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "selfdrivingcars:9000"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=image,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )
    job_name = "new_name"
    rl.fit(job_name=job_name)

    model_name = "model_name"
    name_from_base.return_value = model_name
    new_entry_point = "deploy_script.py"
    model = rl.create_model(entry_point=new_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image_uri == image
    assert model.entry_point == new_entry_point
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    # The model name base is derived from the image repository (tag stripped).
    name_from_base.assert_called_with("selfdrivingcars")
def test_create_mxnet_model(sagemaker_session, rl_coach_mxnet_version):
    """create_model on a Coach/MXNet estimator should yield a matching MXNetModel."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_mxnet_version,
                     framework=RLFramework.MXNET,
                     container_log_level=container_log_level,
                     source_dir=source_dir)
    job_name = 'new_name'
    rl.fit(inputs='s3://mybucket/train', job_name='new_name')
    model = rl.create_model()

    # The framework version is resolved from the toolkit/framework support map.
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    assert isinstance(model, MXNetModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == framework_version
    assert model.py_version == PYTHON_VERSION
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-RL image must raise a ValueError."""
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4"
    rjd = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def test_create_tf_model(name_from_base, sagemaker_session, coach_tensorflow_version):
    """create_model on a Coach/TensorFlow estimator should yield a matching TensorFlowModel."""
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_tensorflow_version,
        framework=RLFramework.TENSORFLOW,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )
    rl.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = rl.create_model()

    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[coach_tensorflow_version][RLFramework.TENSORFLOW.value]

    assert isinstance(model, TensorFlowModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == framework_version
    assert model.role == ROLE
    assert model.name == model_name
    assert model._container_log_level == container_log_level
    assert model.vpc_config is None
    # Depending on SDK version, either repo name can seed the model name.
    call_args = name_from_base.call_args_list[0][0]
    assert call_args[0] in ("sagemaker-rl-tensorflow", "sagemaker-rl-coach-container")
def test_create_model_with_optional_params(sagemaker_session, coach_mxnet_version):
    """Optional create_model overrides (role, entry point, VPC, name) should take effect."""
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )
    rl.fit(job_name="new_name")

    new_role = "role"
    new_entry_point = "deploy_script.py"
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model_name = "model-name"
    model = rl.create_model(
        role=new_role, entry_point=new_entry_point, vpc_config_override=vpc_config, name=model_name
    )

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == new_entry_point
    assert model.name == model_name
def test_create_model_with_custom_image(sagemaker_session):
    """Legacy-API variant: a custom image_name should flow through to the created model."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    image = 'selfdrivingcars:9000'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     image_name=image,
                     container_log_level=container_log_level,
                     source_dir=source_dir)
    job_name = 'new_name'
    rl.fit(job_name=job_name)

    new_entry_point = 'deploy_script.py'
    model = rl.create_model(entry_point=new_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == new_entry_point
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
def test_create_model_with_optional_params(sagemaker_session, rl_coach_mxnet_version):
    """Legacy-API variant: role/entry-point/VPC overrides passed to create_model stick."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_mxnet_version,
                     framework=RLFramework.MXNET,
                     container_log_level=container_log_level,
                     source_dir=source_dir)
    rl.fit(job_name='new_name')

    new_role = 'role'
    new_entry_point = 'deploy_script.py'
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = rl.create_model(role=new_role,
                            entry_point=new_entry_point,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == new_entry_point
def test_create_tf_model(sagemaker_session, rl_coach_tf_version):
    """Legacy-API variant: Coach/TF estimators should create a TF Serving model."""
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_tf_version,
        framework=RLFramework.TENSORFLOW,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )
    job_name = "new_name"
    rl.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = rl.create_model()

    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_tf_version][RLFramework.TENSORFLOW.value]

    assert isinstance(model, tfs.Model)
    assert model.sagemaker_session == sagemaker_session
    assert model._framework_version == framework_version
    assert model.role == ROLE
    assert model.name == job_name
    assert model._container_log_level == container_log_level
    assert model.vpc_config is None
def test_cartpole(docker_image, sagemaker_local_session, processor, tmpdir):
    """Local-mode Coach cartpole run should produce a success marker and a model archive."""
    source_dir = os.path.join(RESOURCE_PATH, 'coach_cartpole')
    dependencies = [os.path.join(RESOURCE_PATH, 'sagemaker_rl')]
    cartpole = 'train_coach.py'

    # Pick the local-mode device flavor matching the requested processor.
    instance_type = 'local' if processor == 'cpu' else 'local_gpu'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_local_session,
                            image_name=docker_image,
                            output_path='file://{}'.format(tmpdir),
                            dependencies=dependencies,
                            hyperparameters={
                                "save_model": 1,
                                "RLCOACH_PRESET": "preset_cartpole_clippedppo",
                                "rl.agent_params.algorithm.discount": 0.9,
                                "rl.evaluation_steps:EnvironmentEpisodes": 1,
                            })
    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', ['success'])
    assert os.path.exists(os.path.join(str(tmpdir), 'model.tar.gz')), 'model file not found'
def test_attach_wrong_framework(sagemaker_session):
    """Legacy-API variant: attaching to a non-RL training image must fail."""
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'
    rjd = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                   'TrainingImage': training_image},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {'VolumeSizeInGB': 30,
                           'InstanceCount': 1,
                           'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def _submit_job_request(self, estimator: RLEstimator) -> object:
    """Kick off a non-blocking training job and return its describe-job response.

    Records the actual job name assigned by the SDK back onto the instance so
    later calls can refer to it.
    """
    # wait=False keeps the current thread free while the job runs remotely.
    estimator.fit(job_name=self._rlestimator_job_name, wait=False)

    launched_name = estimator.latest_training_job.job_name
    self._rlestimator_job_name = launched_name

    return self._sm_client.describe_training_job(TrainingJobName=launched_name)
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    """End-to-end unit check: fit() issues the expected train call and the
    created model builds the expected container definition."""
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
    )

    inputs = "s3://mybucket/train"
    rl.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    # fit() should train then stream logs; boto is only touched for a resource.
    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        RLToolkit.COACH.value, rl_coach_mxnet_version, RLFramework.MXNET.value
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    expected_image_base = (
        "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3"
    )
    submit_dir = "s3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz".format(
        TIMESTAMP
    )
    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": submit_dir,
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": expected_image_base.format(framework_version),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(GPU)

    # A CPU deployment target should select a CPU image variant.
    assert "cpu" in model.prepare_container_def(CPU)["Image"]
def test_gym(sagemaker_session, ecr_image, instance_type, framework):
    """Integration smoke test: a gym training job should complete within 15 minutes."""
    resource_path = os.path.join(RESOURCE_PATH, 'gym')
    # TensorFlow uses a shell launcher; other frameworks run the env script directly.
    gym_script = 'launcher.sh' if framework == 'tensorflow' else 'gym_envs.py'
    estimator = RLEstimator(entry_point=gym_script,
                            source_dir=resource_path,
                            role='SageMakerRole',
                            instance_count=1,
                            instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_uri=ecr_image)
    with timeout(minutes=15):
        estimator.fit()
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    """Legacy-API variant: fit() produces the expected train request and the
    created model's container definition matches the Coach/MXNet image."""
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_mxnet_version,
                     framework=RLFramework.MXNET)

    inputs = 's3://mybucket/train'
    rl.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(RLToolkit.COACH.value,
                                            rl_coach_mxnet_version,
                                            RLFramework.MXNET.value)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    expected_image_base = \
        '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3'
    submit_dir = 's3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz'.format(
        TIMESTAMP)
    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': submit_dir,
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': expected_image_base.format(framework_version),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attaching to a completed RL job should rehydrate every estimator attribute
    from the describe-training-job response."""
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3".format(
        RLFramework.MXNET.value, RLToolkit.COACH.value, rl_coach_mxnet_version
    )
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"train_coach.py"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.framework == RLFramework.MXNET.value
    assert estimator.toolkit == RLToolkit.COACH.value
    assert estimator.framework_version == framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "train_coach.py"
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
def test_ray(sagemaker_session, ecr_image, instance_type, framework):
    """Integration smoke test: a Ray cartpole job should finish within 15 minutes."""
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    # Ray ships separate entry scripts for its TF and Torch backends.
    cartpole = 'train_ray_tf.py' if framework == 'tensorflow' else 'train_ray_torch.py'
    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            instance_count=1,
                            instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_uri=ecr_image)
    with timeout(minutes=15):
        estimator.fit()
def test_ray_tf(sagemaker_session, ecr_image, instance_type):
    """Legacy-API integration smoke test for the Ray/TF cartpole training script."""
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    cartpole = 'train_ray.py'
    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image)
    with timeout(minutes=15):
        estimator.fit()
def test_coach_mxnet(sagemaker_session, coach_mxnet_latest_version, cpu_instance_type):
    """Train a Coach/MXNet job, attach to it, then deploy and invoke the endpoint.

    Verifies the full fit -> attach -> deploy -> predict round trip and that the
    predicted action distribution is a valid pair of probabilities.
    """
    estimator = _test_coach(
        sagemaker_session, RLFramework.MXNET, coach_mxnet_latest_version, cpu_instance_type
    )
    job_name = unique_name_from_base("test-coach-mxnet")

    with timeout(minutes=15):
        # BUG FIX: the original passed wait="False" — a truthy *string*, which
        # made fit() block until completion instead of returning immediately.
        # The boolean False exercises the intended async-fit + attach path
        # (attach blocks until the job finishes before deploy).
        estimator.fit(wait=False, job_name=job_name)

        estimator = RLEstimator.attach(
            estimator.latest_training_job.name, sagemaker_session=sagemaker_session
        )

    endpoint_name = "test-mxnet-coach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            1, cpu_instance_type, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name
        )
        observation = numpy.asarray([0, 0, 0, 0])
        action = predictor.predict(observation)

    # Both action probabilities must lie strictly inside (0, 1).
    assert 0 < action[0][0] < 1
    assert 0 < action[0][1] < 1
def test_wrong_type_parameters(sagemaker_session):
    """An unsupported toolkit/framework/version combination must raise AttributeError."""
    with pytest.raises(AttributeError) as e:
        # Coach toolkit with a Ray version string is an invalid combination.
        RLEstimator(toolkit=RLToolkit.COACH,
                    framework=RLFramework.TENSORFLOW,
                    toolkit_version=RLEstimator.RAY_LATEST_VERSION,
                    entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE)
    assert 'combination is not supported.' in str(e.value)
def test_attach_custom_image(sagemaker_session):
    """Legacy-API variant: attaching to a job trained with a custom image keeps that image."""
    training_image = 'rl:latest'
    returned_job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                   'TrainingImage': training_image},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {'VolumeSizeInGB': 30,
                           'InstanceCount': 1,
                           'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = \
        Mock(name='describe_training_job', return_value=returned_job_description)

    estimator = RLEstimator.attach(training_job_name='neo',
                                   sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job trained from a custom image should preserve that image."""
    training_image = "rl:latest"
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Legacy-API variant: attach() rebuilds a full estimator from a completed job."""
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3'\
        .format(RLFramework.MXNET.value, RLToolkit.COACH.value, rl_coach_mxnet_version)
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    returned_job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                   'TrainingImage': training_image},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"train_coach.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {'VolumeSizeInGB': 30,
                           'InstanceCount': 1,
                           'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = \
        Mock(name='describe_training_job', return_value=returned_job_description)

    estimator = RLEstimator.attach(training_job_name='neo',
                                   sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.framework == RLFramework.MXNET.value
    assert estimator.toolkit == RLToolkit.COACH.value
    assert estimator.framework_version == framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'train_coach.py'
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
def test_vw_cb_explore(local_instance_type, sagemaker_local_session, docker_image, tmpdir,
                       training_data_bandits, role):
    """Local-mode Vowpal Wabbit contextual-bandit run should emit model artifacts."""
    source_path = os.path.join(RESOURCE_PATH, 'vw')
    estimator = RLEstimator(entry_point="train_cb_explore.py",
                            source_dir=source_path,
                            role=role,
                            train_instance_count=1,
                            hyperparameters={"num_arms": 7},
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)
    estimator.fit(inputs=training_data_bandits)

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', ['success'])
    local_mode_utils.assert_output_files_exist(str(tmpdir), 'model', ['vw.model', 'vw.metadata'])
    assert os.path.exists(os.path.join(str(tmpdir), 'model.tar.gz')), 'model file not found'
def configure_estimator(self):
    """Build the DeepRacer RLEstimator from the instance's configuration and
    launch a non-blocking training job, recording the assigned job name."""
    hp = self.hyperparam_data
    self.estimator = RLEstimator(
        entry_point=const.entry_point,
        source_dir=const.source_dir,
        image_name=self.custom_image_name,
        dependencies=["common/"],
        role=self.sagemaker_role,
        train_instance_type=self.instance_type,
        train_instance_count=self.instance_pool_count,
        output_path=self.s3_output_path,
        base_job_name=self.job_name_prefix,
        metric_definitions=self.metric_definitions,
        train_max_run=self.job_duration_in_seconds,
        hyperparameters={
            # S3 locations the training container reads its assets from.
            "s3_bucket": self.s3_bucket,
            "s3_prefix": self.s3_prefix,
            "aws_region": self.aws_region,
            "preset_s3_key": "%s/presets/preset.py" % self.s3_prefix,
            "model_metadata_s3_key": "%s/model_metadata.json" % self.s3_prefix,
            "environment_s3_key": "%s/environments/deepracer_racetrack_env.py" % self.s3_prefix,
            # Algorithm hyperparameters sourced from the user-supplied config.
            "batch_size": hp['batch_size'],
            "num_epochs": hp['optimization_epochs'],
            "beta_entropy": hp['beta_entropy'],
            "lr": hp['learning_rate'],
            "num_episodes_between_training": 20,
            "discount_factor": hp['discount'],
        },
        subnets=self.deepracer_subnets,
        security_group_ids=self.deepracer_security_groups,
    )
    # wait=False: training proceeds in the background.
    self.estimator.fit(wait=False)
    self.job_name = self.estimator.latest_training_job.job_name
    print("Training job: %s" % self.job_name)
def test_gym(local_instance_type, sagemaker_local_session, docker_image, tmpdir, framework):
    """Local-mode gym run should produce a success marker and a model archive."""
    source_path = os.path.join(RESOURCE_PATH, 'gym')
    # TensorFlow uses a shell launcher; other frameworks run the env script directly.
    gym_script = 'launcher.sh' if framework == 'tensorflow' else 'gym_envs.py'
    estimator = RLEstimator(entry_point=gym_script,
                            source_dir=source_path,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)
    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', ['success'])
    assert os.path.exists(os.path.join(str(tmpdir), 'model.tar.gz')), 'model file not found'
def test_wrong_toolkit_format(sagemaker_session):
    """Passing the toolkit as a plain string instead of an RLToolkit enum must raise."""
    with pytest.raises(ValueError) as e:
        RLEstimator(toolkit='coach',  # invalid: must be an RLToolkit member
                    framework=RLFramework.TENSORFLOW,
                    toolkit_version=RLEstimator.COACH_LATEST_VERSION,
                    entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=None)
    assert 'Invalid type' in str(e.value)
def test_missing_required_parameters(sagemaker_session):
    """Omitting toolkit/framework/image information entirely must raise AttributeError."""
    with pytest.raises(AttributeError) as e:
        RLEstimator(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE)
    assert 'Please provide `toolkit`, `toolkit_version`, `framework`' + \
           ' or `image_name` parameter.' in str(e.value)
def test_ray_tf(local_instance_type, sagemaker_local_session, docker_image, tmpdir):
    """Local-mode Ray/TF cartpole run should emit a success marker and model archive."""
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    cartpole = 'train_ray.py'
    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)
    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', ['success'])
    assert os.path.exists(os.path.join(str(tmpdir), 'model.tar.gz')), 'model file not found'
def test_ray_tf(sagemaker_session, ray_tensorflow_latest_version, cpu_instance_type):
    """Ray/TF training should succeed, but deploy() of a Ray model is unsupported."""
    source_dir = os.path.join(DATA_DIR, "ray_cartpole")
    cartpole = "train_ray.py"
    estimator = RLEstimator(
        entry_point=cartpole,
        source_dir=source_dir,
        toolkit=RLToolkit.RAY,
        framework=RLFramework.TENSORFLOW,
        toolkit_version=ray_tensorflow_latest_version,
        sagemaker_session=sagemaker_session,
        role="SageMakerRole",
        instance_type=cpu_instance_type,
        instance_count=1,
    )
    job_name = unique_name_from_base("test-ray-tf")

    with timeout(minutes=15):
        estimator.fit(job_name=job_name)

    # Ray models cannot be auto-deployed; the SDK should say so explicitly.
    with pytest.raises(NotImplementedError) as e:
        estimator.deploy(1, cpu_instance_type)
    assert "Automatic deployment of Ray models is not currently available" in str(e.value)
def test_create_mxnet_model(name_from_base, sagemaker_session, coach_mxnet_version):
    """create_model on a Coach/MXNet estimator should yield a matching MXNetModel
    whose name comes from the framework image base."""
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )
    rl.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = rl.create_model()

    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[coach_mxnet_version][RLFramework.MXNET.value]

    assert isinstance(model, MXNetModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == framework_version
    assert model.py_version == PYTHON_VERSION
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
    name_from_base.assert_called_with("sagemaker-rl-mxnet")