Example #1
def test_create_mxnet_model(sagemaker_session, rl_coach_mxnet_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )

    job_name = "new_name"
    rl.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][
        RLFramework.MXNET.value]

    assert isinstance(model, MXNetModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == framework_version
    assert model.py_version == PYTHON_VERSION
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
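The unit tests in this listing rely on module-level constants (SCRIPT_PATH, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, PYTHON_VERSION) and a mocked sagemaker_session fixture that the excerpts do not show. A minimal sketch of plausible definitions, with the concrete values being assumptions (chosen to stay consistent with the expectations visible in Example #13):

import pytest
from unittest.mock import MagicMock

# Assumed test constants -- the excerpts only reference the names, not the values.
SCRIPT_PATH = "dummy_script.py"
ROLE = "SageMakerRole"
INSTANCE_COUNT = 1
INSTANCE_TYPE = "ml.c4.4xlarge"
PYTHON_VERSION = "py3"

@pytest.fixture()
def sagemaker_session():
    # A mock session records train()/logs_for_job() calls instead of hitting AWS.
    session = MagicMock(name="sagemaker_session")
    session.boto_region_name = "us-west-2"
    session.default_bucket.return_value = "notmybucket"
    return session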
Example #2
def test_cartpole(docker_image, sagemaker_local_session, processor, tmpdir):
    source_dir = os.path.join(RESOURCE_PATH, 'coach_cartpole')
    dependencies = [os.path.join(RESOURCE_PATH, 'sagemaker_rl')]
    cartpole = 'train_coach.py'

    instance_type = 'local' if processor == 'cpu' else 'local_gpu'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            instance_count=1,
                            instance_type=instance_type,
                            sagemaker_session=sagemaker_local_session,
                            image_uri=docker_image,
                            output_path='file://{}'.format(tmpdir),
                            dependencies=dependencies,
                            hyperparameters={
                                "save_model": 1,
                                "RLCOACH_PRESET": "preset_cartpole_clippedppo",
                                "rl.agent_params.algorithm.discount": 0.9,
                                "rl.evaluation_steps:EnvironmentEpisodes": 1,
                            })
    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output',
                                               ['success'])
    assert os.path.exists(os.path.join(str(tmpdir),
                                       'model.tar.gz')), 'model file not found'
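local_mode_utils.assert_output_files_exist is a small helper from the surrounding test suite that is not reproduced here; a hypothetical re-implementation consistent with how it is called above:

import os

def assert_output_files_exist(output_path, directory, file_names):
    # In local mode the container writes its results under <output_path>/<directory>/;
    # fail the test if any expected file is missing.
    for file_name in file_names:
        assert os.path.exists(os.path.join(output_path, directory, file_name)), \
            '{} not found under {}'.format(file_name, directory)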
Example #3
def test_ray_tf(sagemaker_session, ray_tensorflow_latest_version,
                cpu_instance_type):
    source_dir = os.path.join(DATA_DIR, "ray_cartpole")
    cartpole = "train_ray.py"

    estimator = RLEstimator(
        entry_point=cartpole,
        source_dir=source_dir,
        toolkit=RLToolkit.RAY,
        framework=RLFramework.TENSORFLOW,
        toolkit_version=ray_tensorflow_latest_version,
        sagemaker_session=sagemaker_session,
        role="SageMakerRole",
        instance_type=cpu_instance_type,
        instance_count=1,
    )
    job_name = unique_name_from_base("test-ray-tf")

    with timeout(minutes=15):
        estimator.fit(job_name=job_name)

    with pytest.raises(NotImplementedError) as e:
        estimator.deploy(1, cpu_instance_type)
    assert "Automatic deployment of Ray models is not currently available" in str(
        e.value)
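The timeout(minutes=15) context manager used in these integration tests comes from the surrounding test utilities; a rough, Unix-only sketch of the behaviour it provides (the real helper may differ):

import signal
from contextlib import contextmanager

@contextmanager
def timeout(minutes=0, seconds=0):
    # Raise inside the with-block once the limit elapses, so a hung training
    # job cannot stall the test run indefinitely.
    limit = int(minutes) * 60 + int(seconds)

    def _handler(signum, frame):
        raise TimeoutError('block timed out after {} seconds'.format(limit))

    signal.signal(signal.SIGALRM, _handler)
    signal.alarm(limit)
    try:
        yield
    finally:
        signal.alarm(0)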
Example #4
def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "selfdrivingcars:9000"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=image,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )

    job_name = "new_name"
    rl.fit(job_name=job_name)

    model_name = "model_name"
    name_from_base.return_value = model_name
    new_entry_point = "deploy_script.py"
    model = rl.create_model(entry_point=new_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image_uri == image
    assert model.entry_point == new_entry_point
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir

    name_from_base.assert_called_with("selfdrivingcars")
Example #5
def test_wrong_type_parameters(sagemaker_session):
    with pytest.raises(AttributeError) as e:
        RLEstimator(toolkit=RLToolkit.COACH, framework=RLFramework.TENSORFLOW,
                    toolkit_version=RLEstimator.RAY_LATEST_VERSION,
                    entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)
    assert 'combination is not supported.' in str(e.value)
Example #6
def test_create_model_with_optional_params(sagemaker_session,
                                           coach_mxnet_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )

    rl.fit(job_name="new_name")

    new_role = "role"
    new_entry_point = "deploy_script.py"
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model_name = "model-name"
    model = rl.create_model(role=new_role,
                            entry_point=new_entry_point,
                            vpc_config_override=vpc_config,
                            name=model_name)

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == new_entry_point
    assert model.name == model_name
Example #7
def test_create_tf_model(name_from_base, sagemaker_session, coach_tensorflow_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_tensorflow_version,
        framework=RLFramework.TENSORFLOW,
        container_log_level=container_log_level,
        source_dir=source_dir,
    )

    rl.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = rl.create_model()

    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[coach_tensorflow_version][RLFramework.TENSORFLOW.value]

    assert isinstance(model, TensorFlowModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == framework_version
    assert model.role == ROLE
    assert model.name == model_name
    assert model._container_log_level == container_log_level
    assert model.vpc_config is None

    call_args = name_from_base.call_args_list[0][0]
    assert call_args[0] in ("sagemaker-rl-tensorflow", "sagemaker-rl-coach-container")
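name_from_base in this test is a patched stand-in for sagemaker.utils.name_from_base, which derives a unique model name by appending a timestamp to a base string; a simplified sketch of the behaviour being mocked (the exact formatting is an assumption):

import time

def name_from_base(base, max_length=63):
    # Append a UTC timestamp and keep the result within SageMaker's name-length limit.
    suffix = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
    return base[: max_length - len(suffix)] + suffix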
Example #8
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    image = 'selfdrivingcars:9000'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     image_name=image,
                     container_log_level=container_log_level,
                     source_dir=source_dir)

    job_name = 'new_name'
    rl.fit(job_name=job_name)
    new_entry_point = 'deploy_script.py'
    model = rl.create_model(entry_point=new_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == new_entry_point
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
Example #9
def test_create_model_with_optional_params(sagemaker_session,
                                           rl_coach_mxnet_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_mxnet_version,
                     framework=RLFramework.MXNET,
                     container_log_level=container_log_level,
                     source_dir=source_dir)

    rl.fit(job_name='new_name')

    new_role = 'role'
    new_entry_point = 'deploy_script.py'
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = rl.create_model(role=new_role,
                            entry_point=new_entry_point,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == new_entry_point
Example #10
def test_create_tf_model(sagemaker_session, rl_coach_tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_tf_version,
                     framework=RLFramework.TENSORFLOW,
                     container_log_level=container_log_level,
                     source_dir=source_dir)

    job_name = 'new_name'
    rl.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_tf_version][
        RLFramework.TENSORFLOW.value]

    assert isinstance(model, tfs.Model)
    assert model.sagemaker_session == sagemaker_session
    assert model._framework_version == framework_version
    assert model.role == ROLE
    assert model.name == job_name
    assert model._container_log_level == container_log_level
    assert model.vpc_config is None
Example #11
def test_missing_required_parameters(sagemaker_session):
    with pytest.raises(AttributeError) as e:
        RLEstimator(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE)
    assert 'Please provide `toolkit`, `toolkit_version`, `framework`' + \
           ' or `image_name` parameter.' in str(e.value)
Example #12
def test_wrong_toolkit_format(sagemaker_session):
    with pytest.raises(ValueError) as e:
        RLEstimator(toolkit='coach', framework=RLFramework.TENSORFLOW,
                    toolkit_version=RLEstimator.COACH_LATEST_VERSION,
                    entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    framework_version=None)

    assert 'Invalid type' in str(e.value)
Example #13
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    rl = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
    )

    inputs = "s3://mybucket/train"

    rl.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(RLToolkit.COACH.value,
                                            rl_coach_mxnet_version,
                                            RLFramework.MXNET.value)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][
        RLFramework.MXNET.value]

    expected_image_base = "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3"
    submit_dir = "s3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz".format(
        TIMESTAMP)
    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": submit_dir,
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": expected_image_base.format(framework_version),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(GPU)

    assert "cpu" in model.prepare_container_def(CPU)["Image"]
Example #14
def test_gym(sagemaker_session, ecr_image, instance_type, framework):
    resource_path = os.path.join(RESOURCE_PATH, 'gym')
    gym_script = 'launcher.sh' if framework == 'tensorflow' else 'gym_envs.py'
    estimator = RLEstimator(entry_point=gym_script,
                            source_dir=resource_path,
                            role='SageMakerRole',
                            instance_count=1,
                            instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_uri=ecr_image)

    with timeout(minutes=15):
        estimator.fit()
Example #15
def _rl_estimator(sagemaker_session, toolkit=RLToolkit.COACH,
                  toolkit_version=RLEstimator.COACH_LATEST_VERSION, framework=RLFramework.MXNET,
                  train_instance_type=None, base_job_name=None, **kwargs):
    return RLEstimator(entry_point=SCRIPT_PATH,
                       toolkit=toolkit,
                       toolkit_version=toolkit_version,
                       framework=framework,
                       role=ROLE,
                       sagemaker_session=sagemaker_session,
                       train_instance_count=INSTANCE_COUNT,
                       train_instance_type=train_instance_type or INSTANCE_TYPE,
                       base_job_name=base_job_name,
                       **kwargs)
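As a usage illustration, a test elsewhere in the module might call the helper like this (the test name and assertion are assumptions):

def test_default_base_job_name(sagemaker_session):
    estimator = _rl_estimator(sagemaker_session, base_job_name='test-coach')
    assert estimator.base_job_name == 'test-coach'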
Example #16
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    rl = RLEstimator(entry_point=SCRIPT_PATH,
                     role=ROLE,
                     sagemaker_session=sagemaker_session,
                     train_instance_count=INSTANCE_COUNT,
                     train_instance_type=INSTANCE_TYPE,
                     toolkit=RLToolkit.COACH,
                     toolkit_version=rl_coach_mxnet_version,
                     framework=RLFramework.MXNET)

    inputs = 's3://mybucket/train'

    rl.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(RLToolkit.COACH.value,
                                            rl_coach_mxnet_version,
                                            RLFramework.MXNET.value)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = rl.create_model()
    supported_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    framework_version = supported_versions[rl_coach_mxnet_version][
        RLFramework.MXNET.value]

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3'
    submit_dir = 's3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz'.format(
        TIMESTAMP)
    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': submit_dir,
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': expected_image_base.format(framework_version),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
Example #17
def test_ray(sagemaker_session, ecr_image, instance_type, framework):
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    cartpole = 'train_ray_tf.py' if framework == 'tensorflow' else 'train_ray_torch.py'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            instance_count=1,
                            instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_uri=ecr_image)

    with timeout(minutes=15):
        estimator.fit()
Example #18
def test_ray_tf(sagemaker_session, ecr_image, instance_type):
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    cartpole = 'train_ray.py'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image)

    with timeout(minutes=15):
        estimator.fit()
Example #19
def test_wrong_framework_format(sagemaker_session):
    with pytest.raises(ValueError) as e:
        RLEstimator(
            toolkit=RLToolkit.RAY,
            framework="TF",
            toolkit_version=RLEstimator.RAY_LATEST_VERSION,
            entry_point=SCRIPT_PATH,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            framework_version=None,
        )

    assert "Invalid type" in str(e.value)
Example #20
    def configure_estimator(self):
        self.estimator = RLEstimator(
            entry_point=const.entry_point,
            source_dir=const.source_dir,
            image_name=self.custom_image_name,
            dependencies=["common/"],
            role=self.sagemaker_role,
            train_instance_type=self.instance_type,
            train_instance_count=self.instance_pool_count,
            output_path=self.s3_output_path,
            base_job_name=self.job_name_prefix,
            metric_definitions=self.metric_definitions,
            train_max_run=self.job_duration_in_seconds,
            hyperparameters={
                "s3_bucket": self.s3_bucket,
                "s3_prefix": self.s3_prefix,
                "aws_region": self.aws_region,
                "preset_s3_key": "%s/presets/preset.py" % self.s3_prefix,
                "model_metadata_s3_key": "%s/model_metadata.json" % self.s3_prefix,
                "environment_s3_key": "%s/environments/deepracer_racetrack_env.py" % self.s3_prefix,
                "batch_size": self.hyperparam_data['batch_size'],
                "num_epochs": self.hyperparam_data['optimization_epochs'],
                "beta_entropy": self.hyperparam_data['beta_entropy'],
                "lr": self.hyperparam_data['learning_rate'],
                "num_episodes_between_training": 20,
                "discount_factor": self.hyperparam_data['discount']
            },
            subnets=self.deepracer_subnets,
            security_group_ids=self.deepracer_security_groups,
        )

        self.estimator.fit(wait=False)
        self.job_name = self.estimator.latest_training_job.job_name
        print("Training job: %s" % self.job_name)
Example #21
def test_vw_cb_explore(local_instance_type, sagemaker_local_session, docker_image,
                       tmpdir, training_data_bandits, role):
    source_path = os.path.join(RESOURCE_PATH, 'vw')
    estimator = RLEstimator(entry_point="train_cb_explore.py",
                            source_dir=source_path,
                            role=role,
                            train_instance_count=1,
                            hyperparameters={"num_arms": 7},
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)
    estimator.fit(inputs=training_data_bandits)

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', ['success'])
    local_mode_utils.assert_output_files_exist(str(tmpdir), 'model', ['vw.model', 'vw.metadata'])
    assert os.path.exists(os.path.join(str(tmpdir), 'model.tar.gz')), 'model file not found'
Example #22
def test_gym(local_instance_type, sagemaker_local_session, docker_image,
             tmpdir, framework):
    source_path = os.path.join(RESOURCE_PATH, 'gym')
    gym_script = 'launcher.sh' if framework == 'tensorflow' else 'gym_envs.py'
    estimator = RLEstimator(entry_point=gym_script,
                            source_dir=source_path,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)
    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output',
                                               ['success'])
    assert os.path.exists(os.path.join(str(tmpdir),
                                       'model.tar.gz')), 'model file not found'
Example #23
    def _create_job_request(
        self,
        inputs: SageMakerRLEstimatorInputs,
        outputs: SageMakerRLEstimatorOutputs,
    ) -> RLEstimator:
        # Documentation: https://sagemaker.readthedocs.io/en/stable/frameworks/rl/sagemaker.rl.html
        # We need to configure the region, and that is not something we can do via the RLEstimator class.

        # max_wait is only meaningful when spot instances are requested; clear it otherwise.
        if not inputs.spot_instance:
            max_wait_time = None
        else:
            max_wait_time = inputs.max_wait_time

        estimator = RLEstimator(
            entry_point=inputs.entry_point,
            source_dir=inputs.source_dir,
            image_uri=inputs.image,
            toolkit=self._get_toolkit(inputs.toolkit),
            toolkit_version=inputs.toolkit_version,
            framework=self._get_framework(inputs.framework),
            role=inputs.role,
            debugger_hook_config=self._nullable(inputs.debug_hook_config),
            rules=self._nullable(inputs.debug_rule_config),
            instance_type=inputs.instance_type,
            instance_count=inputs.instance_count,
            output_path=inputs.model_artifact_path,
            metric_definitions=inputs.metric_definitions,
            input_mode=inputs.training_input_mode,
            max_run=inputs.max_run,
            hyperparameters=self._validate_hyperparameters(
                inputs.hyperparameters),
            subnets=self._nullable(inputs.vpc_subnets),
            security_group_ids=self._nullable(inputs.vpc_security_group_ids),
            use_spot_instances=inputs.spot_instance,
            enable_network_isolation=inputs.network_isolation,
            encrypt_inter_container_traffic=inputs.traffic_encryption,
            max_wait=max_wait_time,
            sagemaker_session=self._sagemaker_session,
        )

        return estimator
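The private helpers referenced above (_nullable, _get_toolkit, _get_framework, _validate_hyperparameters) are not part of this excerpt; a minimal sketch of what the first three plausibly do, assuming string inputs from the component spec:

def _nullable(self, value):
    # Treat empty strings/collections coming from the spec as "not provided".
    return value if value else None

def _get_toolkit(self, name):
    # Map a free-form toolkit name such as "Coach" or "Ray" onto the SDK enum.
    return RLToolkit[name.strip().upper()]

def _get_framework(self, name):
    # Likewise for "TensorFlow" / "MXNet" / "PyTorch".
    return RLFramework[name.strip().upper()]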
Example #24
def test_ray_tf(sagemaker_session, rl_ray_full_version):
    source_dir = os.path.join(DATA_DIR, 'ray_cartpole')
    cartpole = 'train_ray.py'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            toolkit=RLToolkit.RAY,
                            framework=RLFramework.TENSORFLOW,
                            toolkit_version=rl_ray_full_version,
                            sagemaker_session=sagemaker_session,
                            role='SageMakerRole',
                            train_instance_type=CPU_INSTANCE,
                            train_instance_count=1)

    with timeout(minutes=15):
        estimator.fit()

    with pytest.raises(NotImplementedError) as e:
        estimator.deploy(1, CPU_INSTANCE)
    assert 'Automatic deployment of Ray models is not currently available' in str(e.value)
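Because the SDK refuses to auto-deploy Ray models, serving a trained Ray policy typically means wrapping the exported artifacts in a model object by hand; one possible sketch, assuming the training script exported a TensorFlow SavedModel into the job's model artifacts:

from sagemaker.tensorflow.model import TensorFlowModel

model = TensorFlowModel(model_data=estimator.model_data,
                        role='SageMakerRole',
                        framework_version='2.5',
                        sagemaker_session=sagemaker_session)
predictor = model.deploy(initial_instance_count=1, instance_type=CPU_INSTANCE)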
Example #25
def test_ray_tf(local_instance_type, sagemaker_local_session, docker_image,
                tmpdir):
    source_dir = os.path.join(RESOURCE_PATH, 'ray_cartpole')
    cartpole = 'train_ray.py'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=local_instance_type,
                            sagemaker_session=sagemaker_local_session,
                            output_path='file://{}'.format(tmpdir),
                            image_name=docker_image)

    estimator.fit()

    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output',
                                               ['success'])
    assert os.path.exists(os.path.join(str(tmpdir),
                                       'model.tar.gz')), 'model file not found'
Example #26
def _test_coach(sagemaker_session, rl_framework, rl_coach_version):
    source_dir = os.path.join(DATA_DIR, 'coach_cartpole')
    dependencies = [os.path.join(DATA_DIR, 'sagemaker_rl')]
    cartpole = 'train_coach.py'

    return RLEstimator(toolkit=RLToolkit.COACH,
                       toolkit_version=rl_coach_version,
                       framework=rl_framework,
                       entry_point=cartpole,
                       source_dir=source_dir,
                       role='SageMakerRole',
                       train_instance_count=1,
                       train_instance_type=CPU_INSTANCE,
                       sagemaker_session=sagemaker_session,
                       dependencies=dependencies,
                       hyperparameters={
                           "save_model": 1,
                           "RLCOACH_PRESET": "preset_cartpole_clippedppo",
                           "rl.agent_params.algorithm.discount": 0.9,
                           "rl.evaluation_steps:EnvironmentEpisodes": 1,
                       })
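An illustrative caller for the helper above (the fixture and job names are assumptions):

def test_coach_tf(sagemaker_session, coach_tensorflow_version):
    estimator = _test_coach(sagemaker_session, RLFramework.TENSORFLOW,
                            coach_tensorflow_version)
    with timeout(minutes=15):
        estimator.fit(job_name=unique_name_from_base('test-coach-tf'))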
Example #27
def test_coach(sagemaker_session, ecr_image, instance_type):
    source_dir = os.path.join(RESOURCE_PATH, 'coach_cartpole')
    dependencies = [os.path.join(RESOURCE_PATH, 'sagemaker_rl')]
    cartpole = 'train_coach.py'

    estimator = RLEstimator(entry_point=cartpole,
                            source_dir=source_dir,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image,
                            dependencies=dependencies,
                            hyperparameters={
                                "save_model": 1,
                                "RLCOACH_PRESET": "preset_cartpole_clippedppo",
                                "rl.agent_params.algorithm.discount": 0.9,
                                "rl.evaluation_steps:EnvironmentEpisodes": 1,
                            })

    with timeout(minutes=15):
        estimator.fit()
Example #28
estimator = RLEstimator(
    entry_point="training_worker.py",
    source_dir='src',
    dependencies=["common/sagemaker_rl"],
    toolkit=RLToolkit.COACH,
    toolkit_version='0.11',
    framework=RLFramework.TENSORFLOW,
    sagemaker_session=sage_session,
    #bypass sagemaker SDK validation of the role
    role="aaa/",
    train_instance_type=instance_type,
    train_instance_count=1,
    output_path=s3_output_path,
    base_job_name=job_name,
    image_name=image_name,
    train_max_run=job_duration_in_seconds,  # Maximum runtime in seconds
    hyperparameters={
        "s3_bucket": s3_bucket,
        "s3_prefix": s3_prefix,
        "aws_region": aws_region,
        "model_metadata_s3_key": "s3://{}/custom_files/model_metadata.json".format(s3_bucket),
        "RLCOACH_PRESET": RLCOACH_PRESET,
        "batch_size": 64,
        "beta_entropy": 0.01,
        "discount_factor": 0.999,
        "e_greedy_value": 0.05,
        "epsilon_steps": 10000,
        "exploration_type": "categorical",
        "loss_type": "mean squared error",
        "lr": 0.0003,
        "num_episodes_between_training": 20,
        "num_epochs": 10,
        "stack_size": 1,
        "term_cond_avg_score": 100000.0,
        "term_cond_max_episodes": 100000
        #"pretrained_s3_bucket": "{}".format(s3_bucket),
        #"pretrained_s3_prefix": "rl-deepracer-pretrained"
    },
    metric_definitions=metric_definitions,
    s3_client=s3Client
    #subnets=default_subnets, # Required for VPC mode
    #security_group_ids=default_security_groups, # Required for VPC mode
)
Example #29
if pretrained == True:
    hyperparameters_core['pretrained_s3_bucket'] = "{}".format(
        s3_pretrained_bucket)
    hyperparameters_core['pretrained_s3_prefix'] = s3_pretrained_prefix

# Downloading the hyperparameter file from our local bucket.
hyperparameter_data = io.BytesIO()
s3Client.download_fileobj(
    s3_bucket, hyperparameter_file, hyperparameter_data)
hyperparameters_nn = json.loads(hyperparameter_data.getvalue().decode("utf-8"))
hyperparameters = {**hyperparameters_core, **hyperparameters_nn}
print("Configured following hyperparameters")
print(hyperparameters)
estimator = RLEstimator(entry_point="training_worker.py",
                        source_dir='markov',
                        dependencies=["common/sagemaker_rl","markov"],
                        sagemaker_session=sage_session,
                        # bypass sagemaker SDK validation of the role
                        role="aaa/",
                        train_instance_type=instance_type,
                        train_instance_count=1,
                        output_path=s3_output_path,
                        base_job_name=job_name,
                        image_name=image_name,
                        train_max_run=job_duration_in_seconds,  # Maximum runtime in seconds
                        hyperparameters=hyperparameters,
                        metric_definitions=RLEstimator.default_metric_definitions(RLToolkit.COACH)
                        )

estimator.fit(job_name=job_name, wait=False)
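This snippet assumes a hyperparameters_core dict and a hyperparameter_file S3 key built in earlier notebook cells that are not shown; a hypothetical sketch of those inputs, consistent with how they are used above:

# Hypothetical earlier cells (names and values are assumptions):
hyperparameters_core = {
    's3_bucket': s3_bucket,
    's3_prefix': s3_prefix,
    'aws_region': aws_region,
    'model_metadata_s3_key': '%s/model_metadata.json' % s3_prefix,
}
hyperparameter_file = 'custom_files/hyperparameters.json'  # key inside s3_bucket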
Example #30
if hyper["pretrained"].lower() == "true":
    hyperparameters.update({
        "pretrained_s3_bucket": "{}".format(s3_bucket),
        "pretrained_s3_prefix": "rl-deepracer-pretrained"
    })

estimator = RLEstimator(
    entry_point="training_worker.py",
    source_dir='src',
    dependencies=["common/sagemaker_rl"],
    toolkit=RLToolkit.COACH,
    toolkit_version='0.11',
    framework=RLFramework.TENSORFLOW,
    sagemaker_session=sage_session,
    #bypass sagemaker SDK validation of the role
    role="aaa/",
    train_instance_type=instance_type,
    train_instance_count=1,
    output_path=s3_output_path,
    base_job_name=job_name,
    image_name=image_name,
    train_max_run=job_duration_in_seconds,  # Maximum runtime in seconds
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    s3_client=s3Client
    #subnets=default_subnets, # Required for VPC mode
    #security_group_ids=default_security_groups, # Required for VPC mode
)

estimator.fit(job_name=job_name, wait=False)
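Since fit() is launched with wait=False, the notebook usually follows up by streaming or polling the job; a minimal sketch using session and boto3 calls consistent with the unit tests above:

# Stream CloudWatch logs until the asynchronous training job finishes,
# then report its terminal status.
sage_session.logs_for_job(job_name, wait=True)
status = sage_session.sagemaker_client.describe_training_job(
    TrainingJobName=job_name)['TrainingJobStatus']
print('Training job %s finished with status %s' % (job_name, status))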