def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-Chainer image must raise ``ValueError``.

    ``describe_training_job`` is mocked to report an MXNet training image, so
    ``Chainer.attach`` is expected to fail with a framework-mismatch message.
    """
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    # ``error`` is pytest's ExceptionInfo wrapper; check the message of the
    # exception it captured rather than the wrapper's own string form.
    assert "didn't use image for requested framework" in str(error.value)
Esempio n. 2
0
def _test_mnist_train(sagemaker_session, ecr_image, instance_type,
                      instance_count, script):
    """Upload the MNIST train/test data and fit a Chainer estimator on it.

    The estimator runs ``script`` from ``test/resources/mnist`` using
    ``ecr_image`` on ``instance_count`` instances of ``instance_type``.
    Everything after the source directory lookup runs under a 15 minute
    timeout.
    """
    source_dir = 'test/resources/mnist'

    with timeout(minutes=15):
        data_path = 'test/resources/mnist/data'
        hyperparameters = {'batch-size': 10000, 'epochs': 1}

        chainer = Chainer(
            entry_point=script,
            source_dir=source_dir,
            role='SageMakerRole',
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters=hyperparameters,
        )

        s3_prefix = 'chainer_mnist/{}'.format(sagemaker_timestamp())

        # Upload each channel's local directory under its own key prefix,
        # train first and then test.
        channels = {}
        for channel in ('train', 'test'):
            channels[channel] = sagemaker_session.upload_data(
                path=os.path.join(data_path, channel),
                key_prefix=s3_prefix + '/' + channel,
            )

        chainer.fit(channels)
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-Chainer image must raise ``ValueError``.

    ``describe_training_job`` is mocked to report an MXNet training image, so
    ``Chainer.attach`` is expected to fail with a framework-mismatch message.
    """
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4',
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    # ``error`` is pytest's ExceptionInfo wrapper; check the message of the
    # exception it captured rather than the wrapper's own string form.
    assert "didn't use image for requested framework" in str(error.value)
def test_create_model(name_from_base, sagemaker_session, chainer_version, chainer_py_version):
    """Check that ``create_model`` carries the estimator's settings over to the model.

    ``name_from_base`` is configured to return a fixed model name, and the
    test asserts it was called with the estimator's base job name.
    """
    log_level = '"logging.INFO"'
    s3_source = "s3://mybucket/source"
    job_prefix = "job"

    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=log_level,
        py_version=chainer_py_version,
        base_job_name=job_prefix,
        source_dir=s3_source,
    )
    chainer = Chainer(**estimator_kwargs)
    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")

    # Fix the generated name before the model is created.
    expected_name = "model_name"
    name_from_base.return_value = expected_name
    model = chainer.create_model()

    # Values inherited from the estimator.
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == expected_name
    assert model.container_log_level == log_level
    assert model.source_dir == s3_source
    assert model.vpc_config is None

    name_from_base.assert_called_with(job_prefix)
Esempio n. 5
0
def test_create_model(sagemaker_session, chainer_version):
    """Check that ``create_model`` carries the estimator's settings over to the model.

    The model name is asserted to equal the job name passed to ``fit``.
    """
    log_level = '"logging.INFO"'
    s3_source = 's3://mybucket/source'
    cloudwatch_flag = 'true'

    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=log_level,
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir=s3_source,
        enable_cloudwatch_metrics=cloudwatch_flag,
    )
    chainer = Chainer(**estimator_kwargs)

    job_name = 'new_name'
    chainer.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = chainer.create_model()

    # Values inherited from the estimator.
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == log_level
    assert model.source_dir == s3_source
    assert model.enable_cloudwatch_metrics == cloudwatch_flag
def test_create_model(sagemaker_session, chainer_version):
    """Check that ``create_model`` carries the estimator's settings over to the model.

    The model name is asserted to equal the job name passed to ``fit``, and
    no VPC configuration is expected on the model.
    """
    log_level = '"logging.INFO"'
    s3_source = "s3://mybucket/source"

    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=s3_source,
    )
    chainer = Chainer(**estimator_kwargs)

    training_job = "new_name"
    chainer.fit(inputs="s3://mybucket/train", job_name=training_job)
    model = chainer.create_model()

    # Values inherited from the estimator.
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == training_job
    assert model.container_log_level == log_level
    assert model.source_dir == s3_source
    assert model.vpc_config is None
Esempio n. 7
0
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-Chainer image must raise ``ValueError``.

    ``describe_training_job`` is mocked to report an MXNet training image, so
    ``Chainer.attach`` is expected to fail with a framework-mismatch message.
    """
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4',
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    # ``error`` is pytest's ExceptionInfo wrapper; check the message of the
    # exception it captured rather than the wrapper's own string form.
    assert "didn't use image for requested framework" in str(error.value)
def test_create_model_with_optional_params(sagemaker_session):
    """Check that arguments passed to ``create_model`` show up on the model.

    Covers the role, the number of model server workers, the VPC config
    override and the entry point.
    """
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir="s3://mybucket/source",
        enable_cloudwatch_metrics="true",
    )
    chainer = Chainer(**estimator_kwargs)
    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")

    # Overrides supplied at model-creation time.
    override_role = "role"
    worker_count = 2
    vpc_override = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = chainer.create_model(
        role=override_role,
        model_server_workers=worker_count,
        vpc_config_override=vpc_override,
        entry_point=SERVING_SCRIPT_FILE,
    )

    assert model.role == override_role
    assert model.model_server_workers == worker_count
    assert model.vpc_config == vpc_override
    assert model.entry_point == SERVING_SCRIPT_FILE
Esempio n. 9
0
def test_create_model_with_optional_params(sagemaker_session):
    """Check that arguments passed to ``create_model`` show up on the model.

    Covers the role, the number of model server workers and the VPC config
    override.
    """
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir='s3://mybucket/source',
        enable_cloudwatch_metrics='true',
    )
    chainer = Chainer(**estimator_kwargs)
    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')

    # Overrides supplied at model-creation time.
    override_role = 'role'
    worker_count = 2
    vpc_override = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = chainer.create_model(
        role=override_role,
        model_server_workers=worker_count,
        vpc_config_override=vpc_override,
    )

    assert model.role == override_role
    assert model.model_server_workers == worker_count
    assert model.vpc_config == vpc_override
Esempio n. 10
0
def test_train_image_default(sagemaker_session):
    """Check the training image when no framework version is given.

    The CPU image URI for ``defaults.CHAINER_VERSION`` must be contained in
    the value returned by ``train_image()``.
    """
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
    )
    chainer = Chainer(**estimator_kwargs)

    expected_uri = _get_full_cpu_image_uri(defaults.CHAINER_VERSION)
    actual_image = chainer.train_image()
    assert expected_uri in actual_image
Esempio n. 11
0
def test_chainer_mnist_distributed(docker_image, sagemaker_local_session,
                                   instance_type, customer_script, tmpdir):
    """Train MNIST on a two-instance local cluster, then check artifacts and predictions.

    Skipped for ``local_gpu``. After training, the expected model/output
    files are checked under ``tmpdir`` and prediction response lengths are
    checked with the default, JSON and CSV serializer/deserializer pairs.
    """
    if instance_type == 'local_gpu':
        pytest.skip('Local Mode does not support distributed GPU training.')

    # pure_nccl communicator hangs when only one gpu is available.
    cluster_size = 2
    estimator_kwargs = dict(
        entry_point=customer_script,
        source_dir=mnist_path,
        role=role,
        image_name=docker_image,
        train_instance_count=cluster_size,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={
            'sagemaker_process_slots_per_host': 1,
            'sagemaker_num_processes': cluster_size,
            'batch-size': 10000,
            'epochs': 1,
            'communicator': 'hierarchical',
        },
        output_path='file://{}'.format(tmpdir),
    )
    estimator = Chainer(**estimator_kwargs)

    # Both channels are read from local directories below ``data_dir``.
    channels = {
        channel: 'file://{}'.format(os.path.join(data_dir, channel))
        for channel in ('train', 'test')
    }
    estimator.fit(channels)

    expected_files = {
        'model': ['model.npz'],
        'output': [
            'success',
            'data/accuracy.png',
            'data/cg.dot',
            'data/log',
            'data/loss.png',
        ],
    }
    test_utils.files_exist(str(tmpdir), expected_files)

    payload = np.zeros((100, 784), dtype='float32')

    # Extra positional arguments per call: none first, then the JSON trio,
    # then the CSV trio.
    serializer_variants = (
        (),
        (json_serializer, json_deserializer, 'application/json'),
        (csv_serializer, csv_deserializer, 'text/csv'),
    )
    for extra_args in serializer_variants:
        test_utils.predict_and_assert_response_length(
            estimator, payload, instance_type, *extra_args)
Esempio n. 12
0
def test_chainer_mnist_single_machine(docker_image, sagemaker_local_session,
                                      instance_type, tmpdir):
    """Train MNIST on a single local instance, then check artifacts and predictions.

    After training, the expected model/output files are checked under
    ``tmpdir``. Prediction response lengths are checked with the default and
    CSV serializers, and finally a JSON predictor is exercised with inputs of
    three different shapes.
    """
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    estimator = Chainer(entry_point=customer_script,
                        source_dir=mnist_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        sagemaker_session=sagemaker_local_session,
                        hyperparameters=hyperparameters,
                        output_path='file://{}'.format(tmpdir))

    estimator.fit({
        'train': 'file://{}'.format(os.path.join(data_dir, 'train')),
        'test': 'file://{}'.format(os.path.join(data_dir, 'test'))
    })

    success_files = {
        'model': ['model.npz'],
        'output': [
            'success', 'data/accuracy.png', 'data/cg.dot', 'data/log',
            'data/loss.png'
        ],
    }
    test_utils.files_exist(str(tmpdir), success_files)

    request_data = np.zeros((100, 784), dtype='float32')

    test_utils.predict_and_assert_response_length(estimator, request_data,
                                                  instance_type)
    test_utils.predict_and_assert_response_length(estimator, request_data,
                                                  instance_type,
                                                  csv_serializer,
                                                  csv_deserializer, 'text/csv')

    # Three input shapes, each with 100 leading entries.
    test_arrays = [
        np.zeros((100, 784), dtype='float32'),
        np.zeros((100, 1, 28, 28), dtype='float32'),
        np.zeros((100, 28, 28), dtype='float32')
    ]

    with test_utils.local_mode_lock():
        # Bind ``predictor`` before entering ``try``: if ``_json_predictor``
        # raises, the name would be unbound in ``finally`` and the resulting
        # UnboundLocalError would hide the original failure.
        predictor = _json_predictor(estimator, instance_type)
        try:
            for array in test_arrays:
                response = predictor.predict(array)
                assert len(response) == len(array)
        finally:
            predictor.delete_endpoint()
Esempio n. 13
0
def test_create_model_with_custom_image(sagemaker_session):
    """Check that a custom ``image_name`` on the estimator becomes the model's image."""
    custom_image = 'ubuntu:latest'
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level='"logging.INFO"',
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir='s3://mybucket/source',
    )
    chainer = Chainer(**estimator_kwargs)

    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')
    created_model = chainer.create_model()

    assert created_model.image == custom_image
def test_create_model_with_custom_image(sagemaker_session):
    """Check that a custom ``image_name`` on the estimator becomes the model's image."""
    custom_image = 'ubuntu:latest'
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level='"logging.INFO"',
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir='s3://mybucket/source',
    )
    chainer = Chainer(**estimator_kwargs)

    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')
    created_model = chainer.create_model()

    assert created_model.image == custom_image
def test_training_image_default(sagemaker_session, chainer_version, chainer_py_version):
    """Check that ``training_image_uri()`` equals the CPU image URI for the given versions."""
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        py_version=chainer_py_version,
    )
    chainer = Chainer(**estimator_kwargs)

    expected_uri = _get_full_cpu_image_uri(chainer_version, chainer_py_version)
    actual_uri = chainer.training_image_uri()
    assert expected_uri == actual_uri
Esempio n. 16
0
def _chainer_estimator(sagemaker_session,
                       framework_version=defaults.CHAINER_VERSION,
                       train_instance_type=None,
                       enable_cloudwatch_metrics=False,
                       base_job_name=None,
                       use_mpi=None,
                       num_processes=None,
                       process_slots_per_host=None,
                       additional_mpi_options=None,
                       **kwargs):
    """Build a ``Chainer`` estimator with the module's default test settings.

    ``SCRIPT_PATH``, ``ROLE``, ``INSTANCE_COUNT`` and ``PYTHON_VERSION`` are
    always used. ``INSTANCE_TYPE`` is used when ``train_instance_type`` is
    falsy. Any extra ``kwargs`` are forwarded to ``Chainer`` unchanged.
    """
    # Fall back to the module default when no instance type is supplied.
    resolved_instance_type = train_instance_type or INSTANCE_TYPE

    return Chainer(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=resolved_instance_type,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
        base_job_name=base_job_name,
        use_mpi=use_mpi,
        num_processes=num_processes,
        process_slots_per_host=process_slots_per_host,
        additional_mpi_options=additional_mpi_options,
        py_version=PYTHON_VERSION,
        **kwargs
    )
def test_attach_custom_image(sagemaker_session):
    """Check that attaching to a job with a custom image keeps that image.

    ``describe_training_job`` is mocked to report a custom training image.
    The attached estimator must expose it through both ``image_uri`` and
    ``training_image_uri()``.
    """
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest"

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.image_uri == training_image
    assert estimator.training_image_uri() == training_image
def test_single_machine_failure(docker_image, instance_type, sagemaker_local_session, tmpdir):
    """Check that a failing training script raises and leaves failure files behind.

    ``fit`` is expected to raise ``RuntimeError``. Afterwards the ``failure``
    marker and the expected data file are checked under ``tmpdir``.
    """
    estimator_kwargs = dict(
        entry_point='failure_script.py',
        source_dir=resource_path,
        role=role,
        image_name=docker_image,
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_local_session,
        output_path='file://{}'.format(tmpdir),
    )
    estimator = Chainer(**estimator_kwargs)

    with pytest.raises(RuntimeError):
        estimator.fit()

    expected_data_file = os.path.join('data', 'this_file_is_expected')
    expected_files = {'output': ['failure', expected_data_file]}
    test_utils.files_exist(str(tmpdir), expected_files)
def test_attach_custom_image(sagemaker_session):
    """Check that attaching to a job with a custom image keeps that image.

    ``describe_training_job`` is mocked to report a custom training image.
    The attached estimator must expose it through both ``image_name`` and
    ``train_image()``.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest'

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
Esempio n. 20
0
def test_attach_custom_image(sagemaker_session):
    """Check that attaching to a job with a custom image keeps that image.

    ``describe_training_job`` is mocked to report a custom training image.
    The attached estimator must expose it through both ``image_name`` and
    ``train_image()``.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest'

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
def _chainer_estimator(
    sagemaker_session,
    framework_version,
    py_version,
    instance_type=None,
    base_job_name=None,
    use_mpi=None,
    num_processes=None,
    process_slots_per_host=None,
    additional_mpi_options=None,
    **kwargs
):
    """Build a ``Chainer`` estimator with the module's default test settings.

    ``SCRIPT_PATH``, ``ROLE`` and ``INSTANCE_COUNT`` are always used.
    ``INSTANCE_TYPE`` is used when ``instance_type`` is falsy. Any extra
    ``kwargs`` are forwarded to ``Chainer`` unchanged.
    """
    # Fall back to the module default when no instance type is supplied.
    resolved_instance_type = instance_type or INSTANCE_TYPE

    estimator = Chainer(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        py_version=py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=resolved_instance_type,
        base_job_name=base_job_name,
        use_mpi=use_mpi,
        num_processes=num_processes,
        process_slots_per_host=process_slots_per_host,
        additional_mpi_options=additional_mpi_options,
        **kwargs
    )
    return estimator
Esempio n. 22
0
def test_empty_framework_version(warning, sagemaker_session):
    """Check the fallback when ``framework_version=None`` is passed.

    The estimator must end up with ``defaults.CHAINER_VERSION``, and
    ``warning`` must have been called with that version and
    ``Chainer.LATEST_VERSION``.
    """
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=None,
    )
    estimator = Chainer(**estimator_kwargs)

    assert estimator.framework_version == defaults.CHAINER_VERSION
    warning.assert_called_with(defaults.CHAINER_VERSION, Chainer.LATEST_VERSION)
def test_chainer_airflow_config_uploads_data_source_to_s3(
    sagemaker_session, cpu_instance_type, chainer_full_version
):
    """Build an Airflow training config for Chainer and check the uploaded source.

    The ``sagemaker_submit_directory`` hyperparameter of the generated config,
    with its surrounding double quotes stripped, is passed to
    ``_assert_that_s3_url_contains_data``.
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")
        entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

        estimator_kwargs = dict(
            entry_point=entry_script,
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type="local",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        chainer = Chainer(**estimator_kwargs)

        # Both channels point at local directories below the MNIST data dir.
        channel_inputs = {
            channel: "file://" + os.path.join(mnist_dir, channel)
            for channel in ("train", "test")
        }

        training_config = _build_airflow_workflow(
            estimator=chainer,
            instance_type=cpu_instance_type,
            inputs=channel_inputs,
        )

        submit_directory = training_config["HyperParameters"]["sagemaker_submit_directory"]
        _assert_that_s3_url_contains_data(sagemaker_session, submit_directory.strip('"'))
Esempio n. 24
0
def test_attach_with_additional_hyperparameters(sagemaker_session,
                                                chainer_version):
    """Check that MPI-related hyperparameters survive ``Chainer.attach``.

    ``describe_training_job`` is mocked to report a Chainer image and the
    ``sagemaker_use_mpi``, ``sagemaker_num_processes``,
    ``sagemaker_process_slots_per_host`` and
    ``sagemaker_additional_mpi_options`` hyperparameters. The attached
    estimator must expose them both through ``hyperparameters()`` and as
    attributes.
    """
    image_template = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}"
    training_image = image_template.format(chainer_version, PYTHON_VERSION)

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "sagemaker_region": '"us-west-2"',
        "sagemaker_num_processes": "4",
        "sagemaker_additional_mpi_options": '"-x MY_ENVIRONMENT_VARIABLE"',
        "sagemaker_process_slots_per_host": "10",
        "sagemaker_use_mpi": "true",
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = Chainer.attach(training_job_name="neo",
                               sagemaker_session=sagemaker_session)

    # Values as reported by ``hyperparameters()``.
    # NOTE(review): ``bool()`` is true for any non-empty string, so if this
    # value is a string such as "false" the first assertion still passes -
    # confirm the intended check.
    assert bool(estimator.hyperparameters()["sagemaker_use_mpi"])
    assert int(estimator.hyperparameters()["sagemaker_num_processes"]) == 4
    assert int(estimator.hyperparameters()["sagemaker_process_slots_per_host"]) == 10
    mpi_options = str(estimator.hyperparameters()["sagemaker_additional_mpi_options"])
    assert mpi_options == '"-x MY_ENVIRONMENT_VARIABLE"'

    # Values as exposed on the estimator's attributes.
    assert estimator.use_mpi
    assert estimator.num_processes == 4
    assert estimator.process_slots_per_host == 10
    assert estimator.additional_mpi_options == "-x MY_ENVIRONMENT_VARIABLE"
def test_create_model_with_optional_params(sagemaker_session):
    """Check that ``create_model`` applies an overriding role and worker count."""
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir='s3://mybucket/source',
        enable_cloudwatch_metrics='true',
    )
    chainer = Chainer(**estimator_kwargs)
    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')

    # Values passed to create_model are expected to show up on the model.
    overrides = {'role': 'role', 'model_server_workers': 2}
    model = chainer.create_model(**overrides)

    assert model.role == overrides['role']
    assert model.model_server_workers == overrides['model_server_workers']
Esempio n. 26
0
def test_attach_with_additional_hyperparameters(sagemaker_session,
                                                chainer_version):
    """Attach to a mocked job description that carries MPI hyperparameters.

    ``describe_training_job`` on the session's client is replaced by a Mock
    that returns a canned description. The test then checks that the MPI
    settings are visible both through ``estimator.hyperparameters()`` and as
    attributes of the attached estimator.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training':
            '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'sagemaker_region': '"us-west-2"',
            'sagemaker_num_processes': '4',
            'sagemaker_additional_mpi_options': '"-x MY_ENVIRONMENT_VARIABLE"',
            'sagemaker_process_slots_per_host': '10',
            'sagemaker_use_mpi': 'true'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo',
                               sagemaker_session=sagemaker_session)

    hyperparameters = estimator.hyperparameters()
    # bool() of any non-empty string is True, so bool('false') would also have
    # passed; compare the textual value so the flag is actually verified.
    assert str(hyperparameters['sagemaker_use_mpi']).lower() == 'true'
    assert int(hyperparameters['sagemaker_num_processes']) == 4
    assert int(hyperparameters['sagemaker_process_slots_per_host']) == 10
    # The expected value keeps its embedded double quotes.
    assert str(hyperparameters['sagemaker_additional_mpi_options']
               ) == '"-x MY_ENVIRONMENT_VARIABLE"'

    # The same settings must also be restored as estimator attributes.
    assert estimator.use_mpi
    assert estimator.num_processes == 4
    assert estimator.process_slots_per_host == 10
    assert estimator.additional_mpi_options == "-x MY_ENVIRONMENT_VARIABLE"
Esempio n. 27
0
def test_attach(sagemaker_session, chainer_version):
    """Attach to a mocked, completed training job and verify the rebuilt estimator."""
    image_template = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}"
    role_arn = "arn:aws:iam::366:role/SageMakerRole"
    max_runtime = 24 * 60 * 60
    output_path = "s3://place/output/neo"

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": image_template.format(chainer_version, PYTHON_VERSION),
        },
        "HyperParameters": hyperparameters,
        "RoleArn": role_arn,
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": max_runtime},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": output_path},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    # Serve the canned description instead of the client's real describe call.
    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = Chainer.attach(
        training_job_name="neo", sagemaker_session=sagemaker_session
    )

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == chainer_version
    assert estimator.role == role_arn
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == max_runtime
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == output_path
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
Esempio n. 28
0
def test_attach(sagemaker_session, chainer_version):
    """Rebuild an estimator from a canned job description and compare its fields."""
    algorithm_spec = {
        'TrainingInputMode': 'File',
        'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
            chainer_version, PYTHON_VERSION),
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    description = {
        'AlgorithmSpecification': algorithm_spec,
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo',
        },
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    client = sagemaker_session.sagemaker_client
    client.describe_training_job = Mock(name='describe_training_job',
                                        return_value=description)

    estimator = Chainer.attach(training_job_name='neo',
                               sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'

    # Plain attributes, checked in the same order as they are listed.
    expected_attributes = (
        ('py_version', PYTHON_VERSION),
        ('framework_version', chainer_version),
        ('role', 'arn:aws:iam::366:role/SageMakerRole'),
        ('train_instance_count', 1),
        ('train_max_run', 24 * 60 * 60),
        ('input_mode', 'File'),
        ('base_job_name', 'neo'),
        ('output_path', 's3://place/output/neo'),
        ('output_kms_key', ''),
    )
    for attribute, expected in expected_attributes:
        assert getattr(estimator, attribute) == expected

    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
def test_create_model_with_custom_image(sagemaker_session):
    """A custom ``image_uri`` given to the estimator must carry over to its model."""
    custom_image = "ubuntu:latest"
    estimator_kwargs = {
        "entry_point": SCRIPT_PATH,
        "role": ROLE,
        "sagemaker_session": sagemaker_session,
        "instance_count": INSTANCE_COUNT,
        "instance_type": INSTANCE_TYPE,
        "image_uri": custom_image,
        "container_log_level": '"logging.INFO"',
        "base_job_name": "job",
        "source_dir": "s3://mybucket/source",
    }
    chainer = Chainer(**estimator_kwargs)

    # The model is created only after fit() has been called on the estimator.
    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")
    created_model = chainer.create_model()

    assert created_model.image_uri == custom_image
def test_create_model(sagemaker_session, chainer_version):
    """Check that ``create_model`` copies the estimator's settings onto the model."""
    log_level = '"logging.INFO"'
    source_location = 's3://mybucket/source'
    fit_job_name = 'new_name'

    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=log_level,
        py_version=PYTHON_VERSION,
        base_job_name='job',
        source_dir=source_location,
    )
    chainer.fit(inputs='s3://mybucket/train', job_name=fit_job_name)

    model = chainer.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    # The model is expected to be named after the job passed to fit().
    assert model.name == fit_job_name
    assert model.container_log_level == log_level
    assert model.source_dir == source_location
def _test_mnist(sagemaker_session, ecr_image, instance_type, instance_count,
                script):
    """Fit a Chainer estimator on the MNIST resources, deploy it, and check output length.

    The training and test data are uploaded through ``sagemaker_session`` under
    a timestamped prefix before ``fit`` is called.
    """
    mnist_dir = 'test/resources/mnist'

    with timeout(minutes=15):
        local_data = 'test/resources/mnist/data'

        estimator_kwargs = dict(
            entry_point=script,
            source_dir=mnist_dir,
            role='SageMakerRole',
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters={'batch-size': 10000, 'epochs': 1},
        )
        chainer = Chainer(**estimator_kwargs)

        s3_prefix = 'chainer_mnist/{}'.format(sagemaker_timestamp())

        train_input = sagemaker_session.upload_data(
            path=os.path.join(local_data, 'train'),
            key_prefix=s3_prefix + '/train')
        test_input = sagemaker_session.upload_data(
            path=os.path.join(local_data, 'test'),
            key_prefix=s3_prefix + '/test')

        channels = {'train': train_input, 'test': test_input}
        chainer.fit(channels)

    with timeout_and_delete_endpoint(estimator=chainer, minutes=30):
        predictor = chainer.deploy(initial_instance_count=1,
                                   instance_type=instance_type)

        # One all-zero batch; the response must contain one entry per sample.
        n_samples = 100
        zeros = np.zeros(shape=(n_samples, 1, 28, 28), dtype='float32')
        assert len(predictor.predict(zeros)) == n_samples
Esempio n. 32
0
def test_tuning_chainer(sagemaker_session):
    """Run a two-job hyperparameter tuning of the Chainer MNIST script, then predict.

    After the tuner finishes, the tuner is deployed and queried with three
    differently shaped all-zero batches; each response must have one entry
    per sample.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')

        estimator = Chainer(
            entry_point=entry_script,
            role='SageMakerRole',
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
        )

        uploader = estimator.sagemaker_session
        train_input = uploader.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = uploader.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        ranges = {'alpha': ContinuousParameter(0.001, 0.005)}
        objective = 'Validation-accuracy'
        # The regex captures the metric value from the training log output.
        metrics = [{
            'Name': 'Validation-accuracy',
            'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)',
        }]

        tuner = HyperparameterTuner(estimator, objective, ranges, metrics,
                                    max_jobs=2, max_parallel_jobs=2)

        tuning_job_name = unique_name_from_base('chainer', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input},
                  job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        input_shapes = (
            (batch_size, 784),
            (batch_size, 1, 28, 28),
            (batch_size, 28, 28),
        )
        for shape in input_shapes:
            data = np.zeros(shape, dtype='float32')
            output = predictor.predict(data)
            assert len(output) == batch_size
def test_estimator_py2_warning(warning, sagemaker_session):
    """Creating a ``py2`` estimator must call the ``warning`` fixture with the framework name."""
    py2_kwargs = {
        "entry_point": SCRIPT_PATH,
        "role": ROLE,
        "sagemaker_session": sagemaker_session,
        "train_instance_count": INSTANCE_COUNT,
        "train_instance_type": INSTANCE_TYPE,
        "py_version": "py2",
    }
    estimator = Chainer(**py2_kwargs)

    assert estimator.py_version == "py2"
    warning.assert_called_with(
        estimator.__framework_name__, defaults.LATEST_PY2_VERSION
    )
Esempio n. 34
0
def test_chainer_mnist_custom_loop(docker_image, sagemaker_local_session,
                                   instance_type, tmpdir):
    """Train the custom-loop MNIST script, check its output files, then predict.

    Predictions are requested three times: with the helper's defaults, with the
    JSON serializer pair, and with the CSV serializer pair.
    """
    estimator = Chainer(
        entry_point='single_machine_custom_loop.py',
        source_dir=mnist_path,
        role=role,
        image_name=docker_image,
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={'batch-size': 10000, 'epochs': 1},
        output_path='file://{}'.format(tmpdir),
    )

    channels = {
        channel: 'file://{}'.format(os.path.join(data_dir, channel))
        for channel in ('train', 'test')
    }
    estimator.fit(channels)

    expected_files = {
        'model': ['model.npz'],
        'output': ['success'],
    }
    test_utils.files_exist(str(tmpdir), expected_files)

    request_data = np.zeros((100, 784), dtype='float32')

    # Extra positional arguments for each prediction round; the empty tuple
    # leaves the helper's serializer arguments at their defaults.
    serializer_variants = (
        (),
        (json_serializer, json_deserializer, 'application/json'),
        (csv_serializer, csv_deserializer, 'text/csv'),
    )
    for extra_args in serializer_variants:
        test_utils.predict_and_assert_response_length(
            estimator, request_data, instance_type, *extra_args)
def test_all_processes_finish_with_mpi(docker_image, sagemaker_local_session, tmpdir):
    """
    Validate that all training processes finish before containers are shut down.

    Training uses two local instances with MPI enabled; afterwards the marker
    file for ``algo-2`` must exist under the output directory.
    """
    mpi_hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 2,
        'sagemaker_num_processes': 4,
    }
    estimator_kwargs = dict(
        entry_point='all_processes_finish_customer_script.py',
        source_dir=resource_path,
        role=role,
        image_name=docker_image,
        train_instance_count=2,
        train_instance_type='local',
        sagemaker_session=sagemaker_local_session,
        hyperparameters=mpi_hyperparameters,
        output_path='file://{}'.format(tmpdir),
    )
    estimator = Chainer(**estimator_kwargs)
    estimator.fit()

    marker = os.path.join('data', 'algo-2', 'process_could_complete')
    test_utils.files_exist(str(tmpdir), {'output': [marker]})
def test_chainer(strftime, sagemaker_session, chainer_version):
    """Exercise fit, create_model and deploy, checking the calls the session recorded."""
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
        framework_version=chainer_version,
    )
    chainer = Chainer(**estimator_kwargs)

    inputs = 's3://mybucket/train'
    chainer.fit(inputs=inputs)

    # fit() is expected to record exactly these calls on the mocked session.
    session_calls = sagemaker_session.method_calls
    assert [call[0] for call in session_calls] == ['train', 'logs_for_job']
    boto_calls = sagemaker_session.boto_session.method_calls
    assert [call[0] for call in boto_calls] == ['resource']

    expected_train_args = _create_train_job(chainer_version)
    s3_source = expected_train_args['input_config'][0]['DataSource']['S3DataSource']
    s3_source['S3Uri'] = inputs

    # Index 2 of the first recorded call holds its keyword arguments.
    assert session_calls[0][2] == expected_train_args

    model = chainer.create_model()

    image_template = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-gpu-{}'
    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY':
                's3://mybucket/sagemaker-chainer-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': image_template.format(chainer_version, PYTHON_VERSION),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }
    assert expected_container_def == model.prepare_container_def(GPU)

    cpu_container_def = model.prepare_container_def(CPU)
    assert 'cpu' in cpu_container_def['Image']

    predictor = chainer.deploy(1, GPU)
    assert isinstance(predictor, ChainerPredictor)
def test_attach(sagemaker_session, chainer_version):
    """Attach to job ``neo`` using a mocked description and check the estimator fields."""
    image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        chainer_version, PYTHON_VERSION)
    one_day_seconds = 24 * 60 * 60

    canned_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': one_day_seconds},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo',
        },
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    # Replace the client's describe call so attach() reads the canned dict.
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=canned_description)

    attached = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == 'neo'
    assert attached.py_version == PYTHON_VERSION
    assert attached.framework_version == chainer_version
    assert attached.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert attached.train_instance_count == 1
    assert attached.train_max_run == one_day_seconds
    assert attached.input_mode == 'File'
    assert attached.base_job_name == 'neo'
    assert attached.output_path == 's3://place/output/neo'
    assert attached.output_kms_key == ''
    assert attached.hyperparameters()['training_steps'] == '100'
    assert attached.source_dir == 's3://some/sourcedir.tar.gz'
    assert attached.entry_point == 'iris-dnn-classifier.py'
def test_attach_with_additional_hyperparameters(sagemaker_session, chainer_version):
    """Attach to a mocked job whose hyperparameters include the MPI settings.

    The session client's ``describe_training_job`` is replaced by a Mock that
    returns a canned description. The MPI settings are then checked both in
    ``estimator.hyperparameters()`` and as attributes of the estimator.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'sagemaker_region': '"us-west-2"',
            'sagemaker_num_processes': '4',
            'sagemaker_additional_mpi_options': '"-x MY_ENVIRONMENT_VARIABLE"',
            'sagemaker_process_slots_per_host': '10',
            'sagemaker_use_mpi': 'true',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo',
        },
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    hyperparameters = estimator.hyperparameters()
    # bool() is True for every non-empty string, 'false' included, so the old
    # bool(...) check could never fail; compare the textual value instead.
    assert str(hyperparameters['sagemaker_use_mpi']).lower() == 'true'
    assert int(hyperparameters['sagemaker_num_processes']) == 4
    assert int(hyperparameters['sagemaker_process_slots_per_host']) == 10
    # The expected value keeps its embedded double quotes.
    assert str(hyperparameters['sagemaker_additional_mpi_options']) == '"-x MY_ENVIRONMENT_VARIABLE"'

    # The same settings must also be restored as estimator attributes.
    assert estimator.use_mpi
    assert estimator.num_processes == 4
    assert estimator.process_slots_per_host == 10
    assert estimator.additional_mpi_options == "-x MY_ENVIRONMENT_VARIABLE"
def test_train_image_default(sagemaker_session):
    """Without an explicit framework version, the train image must contain the default CPU URI."""
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
    )
    chainer = Chainer(**estimator_kwargs)

    expected_uri = _get_full_cpu_image_uri(defaults.CHAINER_VERSION)
    assert expected_uri in chainer.train_image()