def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)

def _test_mnist_train(sagemaker_session, ecr_image, instance_type, instance_count, script):
    source_dir = 'test/resources/mnist'

    with timeout(minutes=15):
        data_path = 'test/resources/mnist/data'

        chainer = Chainer(entry_point=script, source_dir=source_dir, role='SageMakerRole',
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, image_name=ecr_image,
                          hyperparameters={'batch-size': 10000, 'epochs': 1})

        prefix = 'chainer_mnist/{}'.format(sagemaker_timestamp())

        train_data_path = os.path.join(data_path, 'train')
        key_prefix = prefix + '/train'
        train_input = sagemaker_session.upload_data(path=train_data_path, key_prefix=key_prefix)

        test_path = os.path.join(data_path, 'test')
        test_input = sagemaker_session.upload_data(path=test_path, key_prefix=prefix + '/test')

        chainer.fit({'train': train_input, 'test': test_input})

def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'},
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)

def test_create_model(name_from_base, sagemaker_session, chainer_version, chainer_py_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    base_job_name = "job"
    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=container_log_level,
        py_version=chainer_py_version,
        base_job_name=base_job_name,
        source_dir=source_dir,
    )

    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = chainer.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None

    name_from_base.assert_called_with(base_job_name)

def test_create_model(sagemaker_session, chainer_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      framework_version=chainer_version, container_log_level=container_log_level,
                      py_version=PYTHON_VERSION, base_job_name='job', source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'new_name'
    chainer.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = chainer.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics

def test_create_model(sagemaker_session, chainer_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    chainer.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = chainer.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None

def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'},
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)

def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = chainer.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE

def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level, py_version=PYTHON_VERSION,
                      base_job_name='job', source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = chainer.create_model(role=new_role, model_server_workers=model_server_workers,
                                 vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config

def test_train_image_default(sagemaker_session):
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      py_version=PYTHON_VERSION)

    assert _get_full_cpu_image_uri(defaults.CHAINER_VERSION) in chainer.train_image()

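# `_get_full_cpu_image_uri` is referenced by the image-URI tests but not included
# in this excerpt. A minimal sketch, assuming the registry account and region seen
# elsewhere in these tests (520713654638 / us-west-2) and the `<version>-cpu-<py_version>`
# tag layout that the attach tests build by hand:
def _get_full_cpu_image_uri(version, py_version=PYTHON_VERSION):
    return '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        version, py_version)
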
def test_chainer_mnist_distributed(docker_image, sagemaker_local_session, instance_type,
                                   customer_script, tmpdir):
    if instance_type == 'local_gpu':
        pytest.skip('Local Mode does not support distributed GPU training.')

    # pure_nccl communicator hangs when only one gpu is available.
    cluster_size = 2
    hyperparameters = {'sagemaker_process_slots_per_host': 1,
                       'sagemaker_num_processes': cluster_size,
                       'batch-size': 10000,
                       'epochs': 1,
                       'communicator': 'hierarchical'}

    estimator = Chainer(entry_point=customer_script,
                        source_dir=mnist_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=cluster_size,
                        train_instance_type=instance_type,
                        sagemaker_session=sagemaker_local_session,
                        hyperparameters=hyperparameters,
                        output_path='file://{}'.format(tmpdir))

    estimator.fit({'train': 'file://{}'.format(os.path.join(data_dir, 'train')),
                   'test': 'file://{}'.format(os.path.join(data_dir, 'test'))})

    success_files = {
        'model': ['model.npz'],
        'output': ['success', 'data/accuracy.png', 'data/cg.dot', 'data/log', 'data/loss.png'],
    }
    test_utils.files_exist(str(tmpdir), success_files)

    request_data = np.zeros((100, 784), dtype='float32')

    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type)
    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type,
                                                  json_serializer, json_deserializer,
                                                  'application/json')
    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type,
                                                  csv_serializer, csv_deserializer, 'text/csv')

def test_chainer_mnist_single_machine(docker_image, sagemaker_local_session, instance_type,
                                      tmpdir):
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    estimator = Chainer(entry_point=customer_script,
                        source_dir=mnist_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        sagemaker_session=sagemaker_local_session,
                        hyperparameters=hyperparameters,
                        output_path='file://{}'.format(tmpdir))

    estimator.fit({'train': 'file://{}'.format(os.path.join(data_dir, 'train')),
                   'test': 'file://{}'.format(os.path.join(data_dir, 'test'))})

    success_files = {
        'model': ['model.npz'],
        'output': ['success', 'data/accuracy.png', 'data/cg.dot', 'data/log', 'data/loss.png'],
    }
    test_utils.files_exist(str(tmpdir), success_files)

    request_data = np.zeros((100, 784), dtype='float32')

    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type)
    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type,
                                                  csv_serializer, csv_deserializer, 'text/csv')

    test_arrays = [np.zeros((100, 784), dtype='float32'),
                   np.zeros((100, 1, 28, 28), dtype='float32'),
                   np.zeros((100, 28, 28), dtype='float32')]

    with test_utils.local_mode_lock():
        predictor = None
        try:
            predictor = _json_predictor(estimator, instance_type)
            for array in test_arrays:
                response = predictor.predict(array)
                # JSON responses should preserve the batch dimension.
                assert len(response) == len(array)
        finally:
            # Guard against _json_predictor failing before assignment, which
            # would otherwise raise NameError here and mask the real error.
            if predictor:
                predictor.delete_endpoint()

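# `_json_predictor` is called above but not defined in this excerpt. A minimal
# sketch under the legacy predictor API, assuming it simply deploys the estimator
# in local mode and swaps in the JSON (de)serializers used by the other tests:
def _json_predictor(estimator, instance_type):
    predictor = estimator.deploy(1, instance_type)
    predictor.content_type = 'application/json'
    predictor.serializer = json_serializer
    predictor.accept = 'application/json'
    predictor.deserializer = json_deserializer
    return predictor
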
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'ubuntu:latest'
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      image_name=custom_image, container_log_level=container_log_level,
                      py_version=PYTHON_VERSION, base_job_name='job', source_dir=source_dir)

    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')
    model = chainer.create_model()

    assert model.image == custom_image

def test_training_image_default(sagemaker_session, chainer_version, chainer_py_version):
    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        framework_version=chainer_version,
        py_version=chainer_py_version,
    )

    assert (
        _get_full_cpu_image_uri(chainer_version, chainer_py_version)
        == chainer.training_image_uri()
    )

def _chainer_estimator(sagemaker_session, framework_version=defaults.CHAINER_VERSION,
                       train_instance_type=None, enable_cloudwatch_metrics=False,
                       base_job_name=None, use_mpi=None, num_processes=None,
                       process_slots_per_host=None, additional_mpi_options=None, **kwargs):
    return Chainer(entry_point=SCRIPT_PATH,
                   framework_version=framework_version,
                   role=ROLE,
                   sagemaker_session=sagemaker_session,
                   train_instance_count=INSTANCE_COUNT,
                   train_instance_type=train_instance_type if train_instance_type else INSTANCE_TYPE,
                   enable_cloudwatch_metrics=enable_cloudwatch_metrics,
                   base_job_name=base_job_name,
                   use_mpi=use_mpi,
                   num_processes=num_processes,
                   process_slots_per_host=process_slots_per_host,
                   additional_mpi_options=additional_mpi_options,
                   py_version=PYTHON_VERSION,
                   **kwargs)

def test_attach_custom_image(sagemaker_session):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest"
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.image_uri == training_image
    assert estimator.training_image_uri() == training_image

def test_single_machine_failure(docker_image, instance_type, sagemaker_local_session, tmpdir):
    customer_script = 'failure_script.py'

    estimator = Chainer(entry_point=customer_script,
                        source_dir=resource_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        sagemaker_session=sagemaker_local_session,
                        output_path='file://{}'.format(tmpdir))

    with pytest.raises(RuntimeError):
        estimator.fit()

    failure_files = {'output': ['failure', os.path.join('data', 'this_file_is_expected')]}
    test_utils.files_exist(str(tmpdir), failure_files)

def test_attach_custom_image(sagemaker_session):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest'
    returned_job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                   'TrainingImage': training_image},
        'HyperParameters': {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                            'sagemaker_program': '"iris-dnn-classifier.py"',
                            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                            'sagemaker_enable_cloudwatch_metrics': 'false',
                            'sagemaker_container_log_level': '"logging.INFO"',
                            'sagemaker_job_name': '"neo"',
                            'training_steps': '100',
                            'sagemaker_region': '"us-west-2"'},
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {'VolumeSizeInGB': 30,
                           'InstanceCount': 1,
                           'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '',
                             'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image

def test_attach_custom_image(sagemaker_session):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_chainer_image:latest'
    returned_job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                   'TrainingImage': training_image},
        'HyperParameters': {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                            'sagemaker_program': '"iris-dnn-classifier.py"',
                            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                            'sagemaker_enable_cloudwatch_metrics': 'false',
                            'sagemaker_container_log_level': '"logging.INFO"',
                            'sagemaker_job_name': '"neo"',
                            'training_steps': '100',
                            'sagemaker_region': '"us-west-2"'},
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {'VolumeSizeInGB': 30,
                           'InstanceCount': 1,
                           'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '',
                             'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image

def _chainer_estimator(
    sagemaker_session,
    framework_version,
    py_version,
    instance_type=None,
    base_job_name=None,
    use_mpi=None,
    num_processes=None,
    process_slots_per_host=None,
    additional_mpi_options=None,
    **kwargs
):
    return Chainer(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        py_version=py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=instance_type if instance_type else INSTANCE_TYPE,
        base_job_name=base_job_name,
        use_mpi=use_mpi,
        num_processes=num_processes,
        process_slots_per_host=process_slots_per_host,
        additional_mpi_options=additional_mpi_options,
        **kwargs
    )

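# A hypothetical usage sketch of the helper above; the version strings are
# placeholders, not pinned by this excerpt:
def test_chainer_mpi_settings(sagemaker_session):
    chainer = _chainer_estimator(
        sagemaker_session,
        framework_version="5.0.0",
        py_version="py3",
        use_mpi=True,
        num_processes=2,
        process_slots_per_host=2,
    )
    assert chainer.use_mpi
    assert chainer.num_processes == 2
    assert chainer.process_slots_per_host == 2
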
def test_empty_framework_version(warning, sagemaker_session):
    estimator = Chainer(entry_point=SCRIPT_PATH,
                        role=ROLE,
                        sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT,
                        train_instance_type=INSTANCE_TYPE,
                        framework_version=None)

    assert estimator.framework_version == defaults.CHAINER_VERSION
    warning.assert_called_with(defaults.CHAINER_VERSION, Chainer.LATEST_VERSION)

def test_chainer_airflow_config_uploads_data_source_to_s3(
    sagemaker_session, cpu_instance_type, chainer_full_version
):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        chainer = Chainer(
            entry_point=script_path,
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type="local",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )

        train_input = "file://" + os.path.join(data_path, "train")
        test_input = "file://" + os.path.join(data_path, "test")

        training_config = _build_airflow_workflow(
            estimator=chainer,
            instance_type=cpu_instance_type,
            inputs={"train": train_input, "test": test_input},
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["HyperParameters"]["sagemaker_submit_directory"].strip('"'),
        )

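# `_assert_that_s3_url_contains_data` is called above but not shown in this
# excerpt. A minimal sketch, assuming it merely lists the submitted S3 prefix
# and checks that at least one object exists there:
from urllib.parse import urlparse

def _assert_that_s3_url_contains_data(sagemaker_session, s3_url):
    parsed_url = urlparse(s3_url)
    s3_request = sagemaker_session.boto_session.client("s3").list_objects_v2(
        Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip("/")
    )
    assert s3_request["KeyCount"] >= 1
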
def test_attach_with_additional_hyperparameters(sagemaker_session, chainer_version):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}".format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "sagemaker_region": '"us-west-2"',
            "sagemaker_num_processes": "4",
            "sagemaker_additional_mpi_options": '"-x MY_ENVIRONMENT_VARIABLE"',
            "sagemaker_process_slots_per_host": "10",
            "sagemaker_use_mpi": "true",
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert bool(estimator.hyperparameters()["sagemaker_use_mpi"])
    assert int(estimator.hyperparameters()["sagemaker_num_processes"]) == 4
    assert int(estimator.hyperparameters()["sagemaker_process_slots_per_host"]) == 10
    assert (str(estimator.hyperparameters()["sagemaker_additional_mpi_options"])
            == '"-x MY_ENVIRONMENT_VARIABLE"')

    assert estimator.use_mpi
    assert estimator.num_processes == 4
    assert estimator.process_slots_per_host == 10
    assert estimator.additional_mpi_options == "-x MY_ENVIRONMENT_VARIABLE"

def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level, py_version=PYTHON_VERSION,
                      base_job_name='job', source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    chainer.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    model = chainer.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers

def test_attach_with_additional_hyperparameters(sagemaker_session, chainer_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'sagemaker_region': '"us-west-2"',
            'sagemaker_num_processes': '4',
            'sagemaker_additional_mpi_options': '"-x MY_ENVIRONMENT_VARIABLE"',
            'sagemaker_process_slots_per_host': '10',
            'sagemaker_use_mpi': 'true'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert bool(estimator.hyperparameters()['sagemaker_use_mpi'])
    assert int(estimator.hyperparameters()['sagemaker_num_processes']) == 4
    assert int(estimator.hyperparameters()['sagemaker_process_slots_per_host']) == 10
    assert (str(estimator.hyperparameters()['sagemaker_additional_mpi_options'])
            == '"-x MY_ENVIRONMENT_VARIABLE"')

    assert estimator.use_mpi
    assert estimator.num_processes == 4
    assert estimator.process_slots_per_host == 10
    assert estimator.additional_mpi_options == "-x MY_ENVIRONMENT_VARIABLE"

def test_attach(sagemaker_session, chainer_version):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}".format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == chainer_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"

def test_attach(sagemaker_session, chainer_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-cpu-{}'.format(
        chainer_version, PYTHON_VERSION)
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = Chainer.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == chainer_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'

def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "ubuntu:latest"
    chainer = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    chainer.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = chainer.create_model()

    assert model.image_uri == custom_image

def test_create_model(sagemaker_session, chainer_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      framework_version=chainer_version, container_log_level=container_log_level,
                      py_version=PYTHON_VERSION, base_job_name='job', source_dir=source_dir)

    job_name = 'new_name'
    chainer.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = chainer.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == chainer_version
    assert model.py_version == chainer.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir

def _test_mnist(sagemaker_session, ecr_image, instance_type, instance_count, script):
    source_dir = 'test/resources/mnist'

    with timeout(minutes=15):
        data_path = 'test/resources/mnist/data'

        chainer = Chainer(entry_point=script, source_dir=source_dir, role='SageMakerRole',
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, image_name=ecr_image,
                          hyperparameters={'batch-size': 10000, 'epochs': 1})

        prefix = 'chainer_mnist/{}'.format(sagemaker_timestamp())

        train_data_path = os.path.join(data_path, 'train')
        key_prefix = prefix + '/train'
        train_input = sagemaker_session.upload_data(path=train_data_path, key_prefix=key_prefix)

        test_path = os.path.join(data_path, 'test')
        test_input = sagemaker_session.upload_data(path=test_path, key_prefix=prefix + '/test')

        chainer.fit({'train': train_input, 'test': test_input})

    with timeout_and_delete_endpoint(estimator=chainer, minutes=30):
        predictor = chainer.deploy(initial_instance_count=1, instance_type=instance_type)

        batch_size = 100
        data = np.zeros(shape=(batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

def test_tuning_chainer(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            py_version=PYTHON_VERSION,
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [
            {'Name': 'Validation-accuracy',
             'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2, max_parallel_jobs=2)

        tuning_job_name = unique_name_from_base('chainer', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name: ' + tuning_job_name)

        # Give the tuning job a moment to register before blocking on it.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

def test_estimator_py2_warning(warning, sagemaker_session):
    estimator = Chainer(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        py_version="py2",
    )

    assert estimator.py_version == "py2"
    warning.assert_called_with(estimator.__framework_name__, defaults.LATEST_PY2_VERSION)

def test_chainer_mnist_custom_loop(docker_image, sagemaker_local_session, instance_type, tmpdir):
    customer_script = 'single_machine_custom_loop.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    estimator = Chainer(entry_point=customer_script,
                        source_dir=mnist_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        sagemaker_session=sagemaker_local_session,
                        hyperparameters=hyperparameters,
                        output_path='file://{}'.format(tmpdir))

    estimator.fit({'train': 'file://{}'.format(os.path.join(data_dir, 'train')),
                   'test': 'file://{}'.format(os.path.join(data_dir, 'test'))})

    success_files = {
        'model': ['model.npz'],
        'output': ['success'],
    }
    test_utils.files_exist(str(tmpdir), success_files)

    request_data = np.zeros((100, 784), dtype='float32')

    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type)
    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type,
                                                  json_serializer, json_deserializer,
                                                  'application/json')
    test_utils.predict_and_assert_response_length(estimator, request_data, instance_type,
                                                  csv_serializer, csv_deserializer, 'text/csv')

def test_all_processes_finish_with_mpi(docker_image, sagemaker_local_session, tmpdir):
    """
    This test validates that all training processes finish before containers are shut down.
    """
    customer_script = 'all_processes_finish_customer_script.py'
    hyperparameters = {'sagemaker_use_mpi': True,
                       'sagemaker_process_slots_per_host': 2,
                       'sagemaker_num_processes': 4}

    estimator = Chainer(entry_point=customer_script,
                        source_dir=resource_path,
                        role=role,
                        image_name=docker_image,
                        train_instance_count=2,
                        train_instance_type='local',
                        sagemaker_session=sagemaker_local_session,
                        hyperparameters=hyperparameters,
                        output_path='file://{}'.format(tmpdir))

    estimator.fit()

    completion_file = {'output': [os.path.join('data', 'algo-2', 'process_could_complete')]}
    test_utils.files_exist(str(tmpdir), completion_file)

def test_chainer(strftime, sagemaker_session, chainer_version):
    chainer = Chainer(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      py_version=PYTHON_VERSION, framework_version=chainer_version)

    inputs = 's3://mybucket/train'

    chainer.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(chainer_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = chainer.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chainer:{}-gpu-{}'
    assert {'Environment':
            {'SAGEMAKER_SUBMIT_DIRECTORY':
             's3://mybucket/sagemaker-chainer-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
             'SAGEMAKER_PROGRAM': 'dummy_script.py',
             'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
             'SAGEMAKER_REGION': 'us-west-2',
             'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'},
            'Image': expected_image_base.format(chainer_version, PYTHON_VERSION),
            'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = chainer.deploy(1, GPU)
    assert isinstance(predictor, ChainerPredictor)
