def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call, access, socket,
                                             sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    # First poll() reports a dead TensorBoard process (port in use), second reports
    # a running one, so fit() should retry on the next port.
    popen().poll.side_effect = [-1, None]

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
                           '--port', '6006'],
                          stderr=-1, stdout=-1)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
                           '--port', '6007'],
                          stderr=-1, stdout=-1)

def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir):
    output_path = 'file://%s' % tmpdir
    job_name = sagemaker.utils.unique_name_from_base('tf-horovod')
    estimator = TensorFlow(entry_point=os.path.join(horovod_dir, 'test_hvd_basic.py'),
                           role='SageMakerRole',
                           train_instance_count=2,
                           train_instance_type='local',
                           sagemaker_session=sagemaker_local_session,
                           py_version=integ.PYTHON_VERSION,
                           script_mode=True,
                           output_path=output_path,
                           framework_version='1.12',
                           distributions={'mpi': {'enabled': True,
                                                  'processes_per_host': processes}})

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

        tmp = str(tmpdir)
        extract_files(output_path.replace('file://', ''), tmp)

        size = instances * processes
        for rank in range(size):
            assert read_json('rank-%s' % rank, tmp)['rank'] == rank

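# The Horovod tests in this section rely on two small helpers, `extract_files` and
# `read_json`, that are defined elsewhere in the test module. A minimal sketch of
# what they are assumed to do, inferred from the call sites (untar the local-mode
# model artifact, then load the JSON file each rank wrote); the real helpers may
# differ in detail:
import json
import os
import tarfile


def extract_files(output_dir, tmp):
    # Local mode writes the trained artifacts as model.tar.gz under output_dir.
    with tarfile.open(os.path.join(output_dir, 'model.tar.gz')) as tar:
        tar.extractall(tmp)


def read_json(filename, tmp):
    # Each Horovod rank is expected to have written '<filename>' as a JSON file.
    with open(os.path.join(tmp, filename)) as f:
        return json.load(f)
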
def test_create_model(sagemaker_session, tf_version): container_log_level = '"logging.INFO"' source_dir = 's3://mybucket/source' enable_cloudwatch_metrics = 'true' tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, framework_version=tf_version, container_log_level=container_log_level, base_job_name='job', source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics) job_name = 'doing something' tf.fit(inputs='s3://mybucket/train', job_name=job_name) model = tf.create_model() assert model.sagemaker_session == sagemaker_session assert model.framework_version == tf_version assert model.py_version == tf.py_version assert model.entry_point == SCRIPT_PATH assert model.role == ROLE assert model.name == job_name assert model.container_log_level == container_log_level assert model.source_dir == source_dir assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
def _run_distributed_training_horovod_basic(
    instances, processes, sagemaker_local_session, docker_image, tmpdir, framework_version
):
    output_path = "file://%s" % tmpdir
    estimator = TensorFlow(
        entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"),
        role="SageMakerRole",
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        train_instance_count=instances,
        image_name=docker_image,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_network_interface_name": "eth0",
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed")))

    tmp = str(tmpdir)
    extract_files(output_path.replace("file://", ""), tmp)

    size = instances * processes
    for rank in range(size):
        local_rank = rank % processes
        assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == {
            "local-rank": local_rank,
            "rank": rank,
            "size": size,
        }

def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           hyperparameters={
                               # Save a checkpoint every 10 steps to hammer the S3 plugin.
                               'save-checkpoint-steps': 10,
                               # Disable throttling for checkpoint and model saving.
                               'throttle-secs': 0,
                               # Without the patch, training jobs would fail around the
                               # 100th to 150th step.
                               'max-steps': 200,
                               # A large batch size produces a larger checkpoint file.
                               'batch-size': 1024,
                               # Export the model during training, which also exercises
                               # stale-model garbage collection.
                               'export-model-during-training': True
                           },
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)
    estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region),
                  job_name=unique_name_from_base('test-tf-sm-s3-mnist'))
    _assert_s3_file_exists(region, estimator.model_data)
    _assert_checkpoint_exists(region, estimator.model_dir, 200)

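# `_assert_s3_file_exists` and `_assert_checkpoint_exists` are not shown in this
# section. A minimal sketch of plausible implementations, assuming the checkpoint
# layout produced by tf.estimator (graph.pbtxt plus model.ckpt-<step>.{index,meta});
# the helpers in the original module may differ:
import os
from urllib.parse import urlparse

import boto3


def _assert_s3_file_exists(region, s3_url):
    # Object.load() issues a HEAD request and raises if the key does not exist.
    parsed = urlparse(s3_url)
    s3 = boto3.resource('s3', region_name=region)
    s3.Object(parsed.netloc, parsed.path.lstrip('/')).load()


def _assert_checkpoint_exists(region, model_dir, checkpoint_number):
    _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt'))
    _assert_s3_file_exists(
        region, os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number)))
    _assert_s3_file_exists(
        region, os.path.join(model_dir, 'model.ckpt-{}.meta'.format(checkpoint_number)))
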
def test_distributed_training_horovod(
    sagemaker_session, instance_type, image_uri, tmpdir, framework_version
):
    mpi_options = "-verbose -x orte_base_help_aggregate=0"
    estimator = TensorFlow(
        entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"),
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        script_mode=True,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_custom_mpi_options": mpi_options,
            "sagemaker_mpi_num_of_processes_per_host": 1,
        },
        sagemaker_session=sagemaker_session,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-horovod"))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session
    )

    for filename in model_data_source.get_file_list():
        assert os.path.basename(filename) == "model.tar.gz"

def test_tf_script_mode(time, strftime, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    py_version='py3',
                    train_instance_type=INSTANCE_TYPE,
                    train_instance_count=1,
                    framework_version='1.11',
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job('1.11', script_mode=True,
                                            repo_name=SM_IMAGE_REPO_NAME,
                                            py_version='py3')
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

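# `_create_train_job` builds the full argument dict that `fit` is expected to pass
# to `sagemaker_session.train`; it is defined elsewhere in the test module. A
# heavily trimmed sketch of the general shape, with illustrative values only (the
# exact keys and defaults depend on the SDK version under test):
def _create_train_job_sketch(tf_version):
    return {
        'image': 'some-ecr-repo/sagemaker-tensorflow:{}-cpu-py2'.format(tf_version),
        'input_mode': 'File',
        'input_config': [{'ChannelName': 'training',
                          'DataSource': {'S3DataSource': {'S3Uri': 's3://mybucket/train'}}}],
        'role': ROLE,
        'output_config': {'S3OutputPath': 's3://mybucket/'},
        'resource_config': {'InstanceType': INSTANCE_TYPE,
                            'InstanceCount': 1,
                            'VolumeSizeInGB': 30},
        'hyperparameters': {},
        'stop_condition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
    }
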
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config

def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print('Note: if launching for the first time in local mode, container image download '
          'might take a few minutes to complete.')

    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = california_housing_estimator.deploy(initial_instance_count=1,
                                                    instance_type='local')

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint(predictor.endpoint_name)

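# `do_inference_on_local_endpoint` is defined elsewhere in this example. A minimal
# sketch, assuming the test split was written as CSV under ./data/test (the file
# name below is hypothetical); it simply sends a small batch to the local endpoint
# and prints the TensorFlow Serving response:
import pandas as pd


def do_inference_on_local_endpoint(predictor):
    test_data = pd.read_csv('./data/test/x_test.csv')  # hypothetical file name
    payload = test_data.values[:10]
    predictions = predictor.predict(payload)['predictions']
    print('predictions: {}'.format(predictions))
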
def test_deploy_with_input_handlers(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point="training.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        train_instance_count=1,
        train_instance_type=instance_type,
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))

    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
            entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
        )

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}

        result = predictor.predict(input_data)
        assert expected_result == result

def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print('Note: if launching for the first time in local mode, container image download '
          'might take a few minutes to complete.')

    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Running Batch Transform in local mode')
    tensorflow_serving_transformer = california_housing_estimator.transformer(
        instance_count=1,
        instance_type='local',
        output_path='file://./data/output',
    )

    tensorflow_serving_transformer.transform('file://./data/input',
                                             split_type='Line',
                                             content_type='text/csv')

    print('Printing Batch Transform output file content')
    with open('./data/output/x_test.csv.out', 'r') as f:
        print(f.read())

def test_tf_local_data_local_script():
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=LocalNoS3Session())

        inputs = 'file://' + DATA_PATH

        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with local_mode_utils.lock():
        try:
            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()

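# `local_mode_utils.lock()` serializes local-mode serving tests, which all bind the
# same local port; a later test in this section inlines the same pattern with raw
# fcntl calls. A minimal sketch of such a helper, assuming a file lock (the lock
# file path is illustrative):
import contextlib
import fcntl
import os
import tempfile

LOCAL_MODE_LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_local_mode_lock')


@contextlib.contextmanager
def lock(path=LOCAL_MODE_LOCK_PATH):
    with open(path, 'w') as lock_file:
        fcntl.lockf(lock_file, fcntl.LOCK_EX)  # block until we hold the lock
        try:
            yield
        finally:
            fcntl.lockf(lock_file, fcntl.LOCK_UN)
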
def test_keras_training(sagemaker_local_session, docker_image, tmpdir, framework_version):
    entry_point = os.path.join(RESOURCE_PATH, 'keras_inception.py')
    output_path = 'file://{}'.format(tmpdir)

    estimator = TensorFlow(entry_point=entry_point,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type='local',
                           image_name=docker_image,
                           sagemaker_session=sagemaker_local_session,
                           model_dir='/opt/ml/model',
                           output_path=output_path,
                           framework_version=framework_version,
                           py_version='py3')

    estimator.fit()

    model = serving.Model(model_data=output_path,
                          role='SageMakerRole',
                          framework_version=framework_version,
                          sagemaker_session=sagemaker_local_session)

    predictor = model.deploy(initial_instance_count=1, instance_type='local')
    assert predictor.predict(np.random.randn(4, 4, 4, 2) * 255)

    predictor.delete_endpoint()

def test_distributed_training_horovod(sagemaker_session, instance_type, ecr_image, tmpdir,
                                      framework_version):
    mpi_options = '-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1'
    estimator = TensorFlow(entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=2,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           hyperparameters={'sagemaker_mpi_enabled': True,
                                            'sagemaker_mpi_custom_mpi_options': mpi_options,
                                            'sagemaker_mpi_num_of_processes_per_host': 1},
                           sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    for filename in model_data_source.get_file_list():
        assert os.path.basename(filename) == 'model.tar.gz'

def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=tests.integ.PYTHON_VERSION,
        script_mode=True,
        framework_version=tf_full_version,
        distributions=PARAMETER_SERVER_DISTRIBUTION,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"),
        key_prefix="scriptmode/distributed_mnist")

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-distributed"))
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )

def test_server_side_encryption(sagemaker_session):
    boto_session = sagemaker_session.boto_session
    with kms_utils.bucket_with_encryption(boto_session, ROLE) as (bucket_with_kms, kms_key):
        output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption',
                                   time.strftime('%y%m%d-%H%M'))

        estimator = TensorFlow(entry_point=SCRIPT,
                               role=ROLE,
                               train_instance_count=1,
                               train_instance_type='ml.c5.xlarge',
                               sagemaker_session=sagemaker_session,
                               py_version='py3',
                               framework_version=TensorFlow.LATEST_VERSION,
                               code_location=output_path,
                               output_path=output_path,
                               model_dir='/opt/ml/model',
                               output_kms_key=kms_key)

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(RESOURCE_PATH, 'data'), key_prefix='scriptmode/mnist')

        with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(inputs=inputs,
                          job_name=unique_name_from_base('test-server-side-encryption'))

def test_smdataparallel_tf_mnist(
    sagemaker_session,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    job_name = sagemaker.utils.unique_name_from_base("tf-sm-distributed-dataparallel")
    estimator = TensorFlow(
        entry_point="mnist_tf.py",
        role="SageMakerRole",
        source_dir=smdataparallel_dir,
        instance_count=2,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

def test_mnist(sagemaker_session, instance_type):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           metric_definitions=[
                               {'Name': 'train:global_steps',
                                'Regex': r'global_step\/sec:\s(.*)'}])
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'), key_prefix='scriptmode/mnist')

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-mnist'))
    _assert_s3_files_exist(estimator.model_dir,
                           ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
    df = estimator.training_job_analytics.dataframe()
    print(df)
    assert df.size > 0

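# `_assert_s3_files_exist` (and the `assert_s3_files_exist` variants used by other
# snapshots of this test) verify that training wrote the expected checkpoint files
# under the S3 model directory. A minimal sketch matching the two-argument call
# above; other variants additionally take a session or region argument:
from urllib.parse import urlparse

import boto3


def _assert_s3_files_exist(s3_url, files):
    parsed = urlparse(s3_url)
    s3 = boto3.client('s3')
    contents = s3.list_objects_v2(Bucket=parsed.netloc,
                                  Prefix=parsed.path.lstrip('/')).get('Contents', [])
    found = {obj['Key'].rsplit('/', 1)[-1] for obj in contents}
    for f in files:
        assert f in found, 'file {} not found under {}'.format(f, s3_url)
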
def test_mnist_async(sagemaker_session):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=1,
                           train_instance_type='ml.c5.4xlarge',
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           tags=TAGS)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'), key_prefix='scriptmode/mnist')
    estimator.fit(inputs=inputs, wait=False,
                  job_name=unique_name_from_base('test-tf-sm-async'))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
                                    estimator.latest_training_job.name, TAGS)
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)

        result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client,
                                    predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client,
                                 estimator.latest_training_job.name, TAGS)

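# The three tag assertions above share one pattern: resolve the resource ARN, then
# compare the SageMaker ListTags output against TAGS. A minimal sketch, assuming
# the model is named after the training job as in this test; the original helpers
# may resolve ARNs differently:
def _assert_tags_match(sagemaker_client, resource_arn, tags):
    actual_tags = sagemaker_client.list_tags(ResourceArn=resource_arn)['Tags']
    assert actual_tags == tags


def _assert_training_job_tags_match(sagemaker_client, training_job_name, tags):
    arn = sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name)['TrainingJobArn']
    _assert_tags_match(sagemaker_client, arn, tags)


def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):
    arn = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)['EndpointArn']
    _assert_tags_match(sagemaker_client, arn, tags)


def _assert_model_tags_match(sagemaker_client, model_name, tags):
    arn = sagemaker_client.describe_model(ModelName=model_name)['ModelArn']
    _assert_tags_match(sagemaker_client, arn, tags)
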
def test_estimator_deploy(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "custom:1.0"
    tf = TensorFlow(
        entry_point="script.py",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    predictor = tf.deploy(INSTANCE_COUNT, INSTANCE_TYPE, endpoint_name="endpoint",
                          endpoint_type="tensorflow-serving")
    assert isinstance(predictor, Predictor)

def test_tf_script_mode_mpi(time, strftime, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    py_version='py3',
                    train_instance_type=INSTANCE_TYPE,
                    train_instance_count=1,
                    framework_version='1.11',
                    source_dir=DATA_DIR,
                    distributions=DISTRIBUTION_MPI_ENABLED)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job('1.11', script_mode=True, horovod=True,
                                            repo_name=SM_IMAGE_REPO_NAME,
                                            py_version='py3')
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs
    expected_train_args['hyperparameters'][TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True)
    expected_train_args['hyperparameters'][TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2)
    expected_train_args['hyperparameters'][TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = \
        json.dumps('options')

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

def test_mnist(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist"))
    _assert_s3_files_exist(
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
        sagemaker_session.boto_region_name,
    )
    df = estimator.training_job_analytics.dataframe()
    assert df.size > 0

def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
        ec2_client = sagemaker_session.boto_session.client('ec2')
        subnet, security_group_id = get_or_create_subnet_and_security_group(ec2_client, VPC_NAME)

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               subnets=[subnet],
                               security_group_ids=[security_group_id])

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)

        job_desc = estimator.sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=estimator.latest_training_job.name)
        assert [subnet] == job_desc['VpcConfig']['Subnets']
        assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']

def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH,
                                               key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result

def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))

def run_tf_training(script, instance_type, instance_count, sagemaker_local_session, docker_image,
                    framework_version, training_data_path, output_path=None,
                    hyperparameters=None):
    hyperparameters = hyperparameters or {}

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_count=instance_count,
                           instance_type=instance_type,
                           sagemaker_session=sagemaker_local_session,
                           image_uri=docker_image,
                           model_dir='/opt/ml/model',
                           output_path=output_path,
                           hyperparameters=hyperparameters,
                           base_job_name='test-tf',
                           framework_version=framework_version,
                           py_version='py3')

    estimator.fit(training_data_path)

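# Example invocation of the helper above from a local-mode test; the script path,
# data path, and image URI below are illustrative placeholders, not values from
# the original suite:
#
# run_tf_training(script='resources/mnist/mnist.py',
#                 instance_type='local',
#                 instance_count=1,
#                 sagemaker_local_session=sagemaker_local_session,
#                 docker_image='preprod-tensorflow:2.1.0-cpu-py3',
#                 framework_version='2.1.0',
#                 training_data_path='file://resources/mnist/data',
#                 hyperparameters={'epochs': 1})
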
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10

def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir): output_path = "file://%s" % tmpdir job_name = sagemaker.utils.unique_name_from_base("tf-horovod") estimator = TensorFlow( entry_point=os.path.join(horovod_dir, "hvd_basic.py"), role="SageMakerRole", train_instance_count=2, train_instance_type="local", sagemaker_session=sagemaker_local_session, py_version=integ.PYTHON_VERSION, script_mode=True, output_path=output_path, framework_version="1.12", distributions={"mpi": {"enabled": True, "processes_per_host": processes}}, ) with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES): estimator.fit(job_name=job_name) tmp = str(tmpdir) extract_files(output_path.replace("file://", ""), tmp) size = instances * processes for rank in range(size): assert read_json("rank-%s" % rank, tmp)["rank"] == rank
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               train_max_run=20 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10

def test_fit_mpi(time, strftime, sagemaker_session):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        framework_version="1.11",
        py_version="py2",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        source_dir=DATA_DIR,
        distribution=DISTRIBUTION_MPI_ENABLED,
    )

    inputs = "s3://mybucket/train"
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job("1.11", horovod=True, py_version="py2")
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["hyperparameters"][TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True)
    expected_train_args["hyperparameters"][TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2)
    expected_train_args["hyperparameters"][TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = json.dumps(
        "options")

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

def test_server_side_encryption(sagemaker_session):
    boto_session = sagemaker_session.boto_session
    with tests.integ.kms_utils.bucket_with_encryption(boto_session, ROLE) as (
        bucket_with_kms,
        kms_key,
    ):
        output_path = os.path.join(
            bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M")
        )

        estimator = TensorFlow(
            entry_point=SCRIPT,
            role=ROLE,
            train_instance_count=1,
            train_instance_type="ml.c5.xlarge",
            sagemaker_session=sagemaker_session,
            script_mode=True,
            framework_version=TensorFlow.LATEST_VERSION,
            py_version=tests.integ.PYTHON_VERSION,
            code_location=output_path,
            output_path=output_path,
            model_dir="/opt/ml/model",
            output_kms_key=kms_key,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
        )

        with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(
                inputs=inputs, job_name=unique_name_from_base("test-server-side-encryption")
            )

def test_run_tensorboard_locally_without_awscli_binary(time, strftime, popen, call, access,
                                                       sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    with pytest.raises(EnvironmentError) as error:
        tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)
    assert str(error.value) == 'The AWS CLI is not installed in the system. Please install the ' \
                               'AWS CLI using the following command: \n pip install awscli'

def test_run_tensorboard_locally(sleep, time, strftime, popen, call, access, rmtree, mkdtemp,
                                 sync, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    popen().poll.return_value = None

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_called_with(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
                              '--port', '6006'],
                             stderr=-1, stdout=-1)

def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call, access, socket, rmtree,
                                             mkdtemp, sync, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    popen().poll.side_effect = [-1, None]

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
                           '--port', '6006'],
                          stderr=-1, stdout=-1)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
                           '--port', '6007'],
                          stderr=-1, stdout=-1)

def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    image_name=custom_image, container_log_level=container_log_level,
                    base_job_name='job', source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image

def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)

def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers

def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    local_mode_lock_fd = open(LOCK_PATH, 'w')
    local_mode_lock = local_mode_lock_fd.fileno()

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=sagemaker_local_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    try:
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='local',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
    finally:
        estimator.delete_endpoint()
        time.sleep(5)
        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)

def test_create_model(sagemaker_session, tf_version): container_log_level = '"logging.INFO"' source_dir = 's3://mybucket/source' tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, framework_version=tf_version, container_log_level=container_log_level, base_job_name='job', source_dir=source_dir) job_name = 'doing something' tf.fit(inputs='s3://mybucket/train', job_name=job_name) model = tf.create_model() assert model.sagemaker_session == sagemaker_session assert model.framework_version == tf_version assert model.py_version == tf.py_version assert model.entry_point == SCRIPT_PATH assert model.role == ROLE assert model.name == job_name assert model.container_log_level == container_log_level assert model.source_dir == source_dir
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)

    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME,
                                                                               JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', 'tensorflow', INSTANCE_TYPE, tf_version, 'py2'),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']

    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)

def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10

def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT,
                           source_dir=SOURCE_DIR,
                           role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2,
                           train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment': {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
                         'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
                         'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
                         'SAGEMAKER_REQUIREMENTS': '',
                         'SAGEMAKER_REGION': REGION,
                         'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})