def test_mxnet_local_data_local_script():
    """Train an MXNet MNIST job in local mode with file:// inputs, then deploy
    locally and run a single prediction."""
    script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

    mx = MXNet(entry_point=script_path, role='SageMakerRole',
               train_instance_count=1, train_instance_type='local',
               sagemaker_session=LocalNoS3Session())

    train_input = 'file://' + os.path.join(data_path, 'train')
    test_input = 'file://' + os.path.join(data_path, 'test')

    mx.fit({'train': train_input, 'test': test_input})
    endpoint_name = mx.latest_training_job.name

    # Since Local Mode uses the same port for serving, we need a lock in order
    # to allow concurrent test execution. The serving test is really fast so it still
    # makes sense to allow this behavior.
    # FIX: the lock file descriptor was previously opened and never closed
    # (leaked on every run); the `with` block now guarantees it is released.
    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()
        try:
            fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
            predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
            data = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(data)
        finally:
            mx.delete_endpoint()
            time.sleep(5)
            fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
def mxnet_model(sagemaker_local_session):
    """Train an MXNet MNIST estimator in local mode and return the resulting model."""
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    estimator = MXNet(entry_point=os.path.join(mnist_dir, 'mnist.py'),
                      role='SageMakerRole',
                      train_instance_count=1,
                      train_instance_type='local',
                      sagemaker_session=sagemaker_local_session)

    upload = estimator.sagemaker_session.upload_data
    channels = {
        'train': upload(path=os.path.join(mnist_dir, 'train'),
                        key_prefix='integ-test-data/mxnet_mnist/train'),
        'test': upload(path=os.path.join(mnist_dir, 'test'),
                       key_prefix='integ-test-data/mxnet_mnist/test'),
    }

    estimator.fit(channels)
    return estimator.create_model(1)
def test_create_model_with_optional_params(sagemaker_session):
    """create_model() should honor the role and model_server_workers overrides."""
    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level='"logging.INFO"',
                      base_job_name='job',
                      source_dir='s3://mybucket/source',
                      enable_cloudwatch_metrics='true')
    estimator.fit(inputs='s3://mybucket/train', job_name='new_name')

    overridden_role = 'role'
    workers = 2
    model = estimator.create_model(role=overridden_role, model_server_workers=workers)

    assert model.role == overridden_role
    assert model.model_server_workers == workers
def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_version):
    """Train an MXNet MNIST estimator, then run a local-mode batch transform and
    verify the transform output file lands under ``tmpdir``."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')
    mx = MXNet(entry_point=script_path, role='SageMakerRole', train_instance_count=1,
               train_instance_type='ml.c4.xlarge', framework_version=mxnet_full_version,
               sagemaker_session=sagemaker_local_session)

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=15):
        mx.fit({'train': train_input, 'test': test_input})

    transform_input_path = os.path.join(data_path, 'transform')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    # The transform writes its output to the local filesystem via a file:// URI.
    output_path = 'file://%s' % (str(tmpdir))
    transformer = mx.transformer(1, 'local', assemble_with='Line', max_payload=1,
                                 strategy='SingleRecord', output_path=output_path)

    # Local mode serves on a fixed port, so serialize concurrent test runs.
    with local_mode_utils.lock():
        transformer.transform(transform_input, content_type='text/csv', split_type='Line')
        transformer.wait()

    assert os.path.exists(os.path.join(str(tmpdir), 'data.csv.out'))
def test_mxnet_neo(strftime, sagemaker_session, mxnet_version, skip_if_mms_version):
    """Verify compile_model() (SageMaker Neo) wiring against a mocked session:
    the exact sequence of session calls, the compile-model arguments, the Neo
    inference image, and deploy behavior for compiled vs. non-compiled targets."""
    mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
               train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
               framework_version=mxnet_version)

    inputs = 's3://mybucket/train'
    mx.fit(inputs=inputs)

    input_shape = {'data': [100, 1, 28, 28]}
    output_location = 's3://neo-sdk-test'
    compiled_model = mx.compile_model(target_instance_family='ml_c4', input_shape=input_shape,
                                      output_path=output_location)

    # The mocked session records every invocation; assert the exact sequence
    # triggered by fit() + compile_model().
    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == [
        'train', 'logs_for_job', 'sagemaker_client.describe_training_job',
        'compile_model', 'wait_for_compilation_job'
    ]

    expected_compile_model_args = _create_compilation_job(
        json.dumps(input_shape), output_location)
    # method_calls[3] is the compile_model call; index [2] selects its kwargs.
    actual_compile_model_args = sagemaker_session.method_calls[3][2]
    assert expected_compile_model_args == actual_compile_model_args

    assert compiled_model.image == _neo_inference_image(mxnet_version)

    predictor = mx.deploy(1, CPU, use_compiled_model=True)
    assert isinstance(predictor, MXNetPredictor)

    # Deploying a compiled model to an instance family it was not compiled for
    # must fail with a descriptive error message.
    with pytest.raises(Exception) as wrong_target:
        mx.deploy(1, CPU_C5, use_compiled_model=True)
    assert str(wrong_target.value).startswith('No compiled model for')

    # deploy without sagemaker Neo should continue to work
    mx.deploy(1, CPU)
def test_transform_mxnet(sagemaker_session, mxnet_full_version):
    """Train an MXNet MNIST estimator, run a batch transform whose volume is
    encrypted with a KMS key, and verify the key shows up on the transform job."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')
    mx = MXNet(entry_point=script_path, role='SageMakerRole', train_instance_count=1,
               train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session,
               framework_version=mxnet_full_version)

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    # Resolve (or create) the account's integ-test KMS key to encrypt the
    # transform job's attached volume.
    sts_client = sagemaker_session.boto_session.client('sts')
    account_id = sts_client.get_caller_identity()['Account']
    kms_client = sagemaker_session.boto_session.client('kms')
    kms_key_arn = get_or_create_kms_key(kms_client, account_id)

    transformer = _create_transformer_and_transform_job(
        mx, transform_input, kms_key_arn)
    # Ensure the backing model is deleted even if the transform times out.
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()

        job_desc = transformer.sagemaker_session.sagemaker_client.describe_transform_job(
            TransformJobName=transformer.latest_transform_job.name)
        assert kms_key_arn == job_desc['TransformResources']['VolumeKmsKeyId']
def mxnet_model(sagemaker_local_session):
    """Train an MXNet MNIST estimator in local mode and return its model."""
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    entry = os.path.join(mnist_dir, 'mnist.py')
    trainer = MXNet(entry_point=entry, role='SageMakerRole',
                    train_instance_count=1, train_instance_type='local',
                    sagemaker_session=sagemaker_local_session)

    session = trainer.sagemaker_session
    train_channel = session.upload_data(
        path=os.path.join(mnist_dir, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train')
    test_channel = session.upload_data(
        path=os.path.join(mnist_dir, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test')

    trainer.fit({'train': train_channel, 'test': test_channel})
    return trainer.create_model(1)
def test_keras_training(docker_image, sagemaker_local_session, local_instance_type,
                        framework_version, tmpdir):
    """Run the Keras MNIST example through local-mode MXNet training and check
    that all expected model artifacts are produced."""
    keras_path = os.path.join(RESOURCE_PATH, 'keras')
    estimator = MXNet(entry_point=os.path.join(keras_path, 'keras_mnist.py'),
                      role='SageMakerRole',
                      train_instance_count=1,
                      train_instance_type=local_instance_type,
                      sagemaker_session=sagemaker_local_session,
                      image_name=docker_image,
                      framework_version=framework_version,
                      output_path='file://{}'.format(tmpdir))

    estimator.fit({'train': 'file://{}'.format(os.path.join(keras_path, 'data'))})

    for directory, files in MODEL_SUCCESS_FILES.items():
        local_mode_utils.assert_output_files_exist(str(tmpdir), directory, files)
def test_create_model_with_custom_image(sagemaker_session):
    """create_model() should propagate the custom training image and all other
    estimator settings to the model."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'mxnet:2.0'
    mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
               train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
               image_name=custom_image, container_log_level=container_log_level,
               base_job_name='job', source_dir=source_dir)

    job_name = 'new_name'
    # FIX: pass the job_name variable instead of repeating the 'new_name'
    # literal, so the model-name assertion below cannot silently drift.
    mx.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = mx.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
def test_create_model_with_optional_params(sagemaker_session):
    """create_model() should honor role, model_server_workers and VPC overrides."""
    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level='"logging.INFO"',
                      base_job_name='job',
                      source_dir='s3://mybucket/source',
                      enable_cloudwatch_metrics='true')
    estimator.fit(inputs='s3://mybucket/train', job_name='new_name')

    overridden_role = 'role'
    workers = 2
    vpc_override = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = estimator.create_model(role=overridden_role,
                                   model_server_workers=workers,
                                   vpc_config_override=vpc_override)

    assert model.role == overridden_role
    assert model.model_server_workers == workers
    assert model.vpc_config == vpc_override
def test_mxnet_local_training_env(mxnet_training_latest_version, mxnet_training_latest_py_version):
    """Local-mode training should accept custom environment variables on the estimator."""
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    estimator = MXNet(
        entry_point=os.path.join(mnist_dir, "check_env.py"),
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        sagemaker_session=LocalNoS3Session(),
        environment={"MYVAR": "HELLO_WORLD"},
    )

    channels = {
        "train": "file://" + os.path.join(mnist_dir, "train"),
        "test": "file://" + os.path.join(mnist_dir, "test"),
    }
    estimator.fit(channels)
def test_mxnet_training_failure(sagemaker_local_session, mxnet_full_version, tmpdir):
    """A failing training script should raise RuntimeError and still upload an
    output archive containing the 'failure' marker file."""
    failing_script = os.path.join(DATA_DIR, "mxnet_mnist", "failure_script.py")
    estimator = MXNet(
        entry_point=failing_script,
        role="SageMakerRole",
        framework_version=mxnet_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        output_path="file://{}".format(tmpdir),
    )

    with pytest.raises(RuntimeError):
        estimator.fit()

    # getmember() raises KeyError if the failure marker is absent from the archive.
    with tarfile.open(os.path.join(str(tmpdir), "output.tar.gz")) as tar:
        tar.getmember("failure")
def test_transform_mxnet_tags(sagemaker_session, mxnet_full_version):
    """Train MXNet MNIST, run a batch transform with tags attached, and verify
    the tags propagate to the model created by the transform job."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]
    mx = MXNet(entry_point=script_path, role='SageMakerRole', train_instance_count=1,
               train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session,
               framework_version=mxnet_full_version)

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/mxnet_mnist/test')

    job_name = unique_name_from_base('test-mxnet-transform')
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = mx.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    # Clean up the backing model even if the transform job times out.
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()

        # The transformer creates a model under the hood; its tags must match.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
def test_create_model(sagemaker_session, mxnet_version):
    """create_model() should inherit framework settings from the estimator."""
    log_level = '"logging.INFO"'
    src = 's3://mybucket/source'
    estimator = MXNet(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      framework_version=mxnet_version,
                      container_log_level=log_level,
                      base_job_name='job',
                      source_dir=src)

    job_name = 'new_name'
    estimator.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = estimator.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == mxnet_version
    assert model.py_version == estimator.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == log_level
    assert model.source_dir == src
    assert model.vpc_config is None
def test_create_model_with_custom_hosting_image(sagemaker_session):
    """create_model(image_uri=...) should override the training image for hosting."""
    training_image = "mxnet:2.0"
    hosting_image = "mxnet_hosting:2.0"
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        framework_version="2.0",
        py_version="py3",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=training_image,
        container_log_level='"logging.INFO"',
        base_job_name="job",
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    model = estimator.create_model(image_uri=hosting_image)
    assert model.image_uri == hosting_image
def _create_and_fit_estimator(mxnet_version, py_version, sagemaker_session, instance_type, tmpdir):
    """Run a 2-node Horovod (MPI) MXNet MNIST job, then check each rank's JSON record."""
    estimator = MXNet(
        entry_point=os.path.join(horovod_dir, "hvd_mnist_mxnet.py"),
        role="SageMakerRole",
        instance_count=2,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=py_version,
        framework_version=mxnet_version,
        distribution={"mpi": {"enabled": True}},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base("mx-horovod"))

    extracted = str(tmpdir)
    extract_files_from_s3(estimator.model_data, extracted, sagemaker_session)

    # Each MPI rank writes rank-<i>.json containing its own rank number.
    for rank in (0, 1):
        assert read_json("rank-%s" % rank, extracted)["rank"] == rank
def test_mxnet_local_data_local_script(mxnet_training_latest_version,
                                       mxnet_training_latest_py_version):
    """Train and serve in pure local mode with file:// inputs, asserting that no
    S3 calls are made through the boto session."""
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(data_path, "mnist.py")

    local_no_s3_session = LocalNoS3Session()
    # Wrap the boto resource/client factories so we can assert afterwards that
    # no S3 resource/client was ever requested.
    local_no_s3_session.boto_session.resource = Mock(
        side_effect=local_no_s3_session.boto_session.resource)
    local_no_s3_session.boto_session.client = Mock(
        side_effect=local_no_s3_session.boto_session.client)

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=mxnet_training_latest_version,
        py_version=mxnet_training_latest_py_version,
        sagemaker_session=local_no_s3_session,
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")
    mx.fit({"train": train_input, "test": test_input})

    endpoint_name = mx.latest_training_job.name
    # FIX: if deploy() raised, `predictor` was unbound and the finally block
    # masked the real failure with a NameError. Initialize it and guard.
    predictor = None
    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(1, "local", endpoint_name=endpoint_name)
            data = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(data)

            # check if no boto_session s3 calls were made
            with pytest.raises(AssertionError):
                local_no_s3_session.boto_session.resource.assert_called_with(
                    "s3", region_name=ANY)
            with pytest.raises(AssertionError):
                local_no_s3_session.boto_session.client.assert_called_with(
                    "s3", region_name=ANY)
        finally:
            if predictor is not None:
                predictor.delete_endpoint()
def mxnet_estimator(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Fixture helper: train an MXNet MNIST estimator and return it."""
    estimator = MXNet(
        entry_point=os.path.join(MXNET_MNIST_PATH, "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    upload = estimator.sagemaker_session.upload_data
    channels = {
        "train": upload(path=os.path.join(MXNET_MNIST_PATH, "train"),
                        key_prefix="integ-test-data/mxnet_mnist/train"),
        "test": upload(path=os.path.join(MXNET_MNIST_PATH, "test"),
                       key_prefix="integ-test-data/mxnet_mnist/test"),
    }

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(channels, job_name=unique_name_from_base("test-mxnet-transform"))

    return estimator
def test_create_model_with_custom_image(sagemaker_session):
    """create_model() should carry the custom image and all other estimator
    settings over to the model."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    custom_image = 'mxnet:2.0'
    mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
               train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
               image_name=custom_image, container_log_level=container_log_level,
               base_job_name='job', source_dir=source_dir,
               enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'new_name'
    # FIX: pass the job_name variable instead of repeating the 'new_name'
    # literal, so the model-name assertion below cannot silently drift.
    mx.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = mx.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
def test_transform_mxnet(sagemaker_session):
    """Train an MXNet MNIST estimator, then run a batch transform over the
    uploaded CSV input and wait for it to finish."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')
    mx = MXNet(entry_point=script_path, role='SageMakerRole', train_instance_count=1,
               train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session)

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                   key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                  key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(path=transform_input_path,
                                                       key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    transformer.wait()
def test_transform_mxnet_logs(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Train MXNet MNIST, then run a batch transform with wait=True/logs=True to
    exercise the transform-job log-streaming path."""
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(data_path, "mnist.py")
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
    )
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
    )

    job_name = unique_name_from_base("test-mxnet-transform")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_input, "test": test_input}, job_name=job_name)

    transform_input_path = os.path.join(data_path, "transform", "data.csv")
    transform_input_key_prefix = "integ-test-data/mxnet_mnist/transform"
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    # wait=True/logs=True makes the helper block and stream the job's logs.
    with timeout(minutes=45):
        transformer = _create_transformer_and_transform_job(
            mx, transform_input, cpu_instance_type, wait=True, logs=True
        )

    # Clean up the backing model even if the final wait times out.
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
def test_create_model_with_optional_params(
    sagemaker_session, mxnet_inference_version, mxnet_inference_py_version
):
    """create_model() should honor role, workers, VPC, entry point, env and name
    overrides supplied by the caller."""
    estimator = MXNet(
        entry_point=SCRIPT_NAME,
        source_dir="s3://mybucket/source",
        framework_version=mxnet_inference_version,
        py_version=mxnet_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        base_job_name="job",
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    overridden_role = "role"
    workers = 2
    vpc_override = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    overridden_name = "model-name"
    model = estimator.create_model(
        role=overridden_role,
        model_server_workers=workers,
        vpc_config_override=vpc_override,
        entry_point=SERVING_SCRIPT_FILE,
        env=ENV,
        name=overridden_name,
    )

    assert model.role == overridden_role
    assert model.model_server_workers == workers
    assert model.vpc_config == vpc_override
    assert model.entry_point == SERVING_SCRIPT_FILE
    assert model.env == ENV
    assert model.name == overridden_name
def test_create_model(
    name_from_base, sagemaker_session, mxnet_inference_version, mxnet_inference_py_version
):
    """create_model() should inherit estimator settings and pick its name via
    name_from_base(base_job_name)."""
    log_level = '"logging.INFO"'
    src = "s3://mybucket/source"
    base_name = "job"
    estimator = MXNet(
        entry_point=SCRIPT_NAME,
        source_dir=src,
        framework_version=mxnet_inference_version,
        py_version=mxnet_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=log_level,
        base_job_name=base_name,
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    expected_name = "model_name"
    name_from_base.return_value = expected_name

    model = estimator.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == mxnet_inference_version
    assert model.py_version == mxnet_inference_py_version
    assert model.entry_point == SCRIPT_NAME
    assert model.role == ROLE
    assert model.name == expected_name
    assert model.container_log_level == log_level
    assert model.source_dir == src
    assert model.image_uri is None
    assert model.vpc_config is None
    name_from_base.assert_called_with(base_name)
def test_mxnet_local_data_local_script():
    """Train with file:// channels in pure local mode, deploy locally, and predict."""
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    estimator = MXNet(entry_point=os.path.join(mnist_dir, 'mnist_framework_mode.py'),
                      role='SageMakerRole',
                      train_instance_count=1,
                      train_instance_type='local',
                      sagemaker_session=LocalNoS3Session())

    channels = {
        'train': 'file://' + os.path.join(mnist_dir, 'train'),
        'test': 'file://' + os.path.join(mnist_dir, 'test'),
    }
    estimator.fit(channels)

    # Local mode serves on a fixed port; lock to serialize concurrent tests.
    with local_mode_utils.lock():
        try:
            predictor = estimator.deploy(
                1, 'local', endpoint_name=estimator.latest_training_job.name)
            predictor.predict(numpy.zeros(shape=(1, 1, 28, 28)))
        finally:
            estimator.delete_endpoint()
def test_keras_training(docker_image, sagemaker_local_session, local_instance_type,
                        framework_version, tmpdir):
    """Run the Keras MNIST example through local-mode MXNet training and check
    that all expected model artifacts are produced; skipped on MXNet >= 1.9.0
    where Keras support was removed."""
    if Version(framework_version) >= Version('1.9.0'):
        # FIX: this was an f-string with no placeholders (lint F541); a plain
        # string literal carries the identical message.
        pytest.skip("Keras support has been deprecated MXNet 1.9.0 onwards")

    keras_path = os.path.join(RESOURCE_PATH, 'keras')
    script_path = os.path.join(keras_path, 'keras_mnist.py')

    mx = MXNet(entry_point=script_path,
               role='SageMakerRole',
               instance_count=1,
               instance_type=local_instance_type,
               sagemaker_session=sagemaker_local_session,
               image_uri=docker_image,
               framework_version=framework_version,
               output_path='file://{}'.format(tmpdir))

    train = 'file://{}'.format(os.path.join(keras_path, 'data'))
    mx.fit({'train': train})

    for directory, files in MODEL_SUCCESS_FILES.items():
        local_mode_utils.assert_output_files_exist(str(tmpdir), directory, files)
def test_deploy(sagemaker_session, tf_version):
    """Fit an estimator on mocked infrastructure, deploy to a CPU instance, and
    verify the model is created with the expected image and environment.

    NOTE(review): the parameter and constant names (tf_version, CPU_IMAGE_NAME,
    'test-cifar') suggest this was adapted from a TensorFlow test even though it
    builds an MXNet estimator — confirm the naming is intentional.
    """
    estimator = MXNet(entry_point=SCRIPT, source_dir=SOURCE_DIR, role=ROLE,
                      framework_version=tf_version, train_instance_count=2,
                      train_instance_type=INSTANCE_TYPE_GPU,
                      sagemaker_session=sagemaker_session, base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    # Training ran on GPU instances, but deploying to a CPU instance must
    # select the CPU serving image.
    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment':
         {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
          'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
          'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
          'SAGEMAKER_REGION': REGION,
          'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    """Train and batch-transform inside a VPC; verify the subnets and security
    group appear on both the training job and the model."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    # Reuse (or create) the shared VPC resources used by integ tests.
    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    mx = MXNet(entry_point=script_path, role='SageMakerRole', train_instance_count=1,
               train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session,
               framework_version=mxnet_full_version, subnets=subnet_ids,
               security_group_ids=[security_group_id])

    # Upload the train/test channels to the session's default bucket.
    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    # The training job itself must carry the VPC config.
    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    with timeout(minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()

    # The model created for the transform must inherit the same VPC config.
    model_desc = sagemaker_session.sagemaker_client.describe_model(
        ModelName=transformer.model_name)
    assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
def test_mxnet(strftime, sagemaker_session, mxnet_version):
    """End-to-end unit test of the MXNet estimator against a mocked session:
    fit() call sequence and arguments, create_model(), the GPU/CPU container
    definitions, and deploy() returning an MXNetPredictor."""
    mx = MXNet(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
               train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
               framework_version=mxnet_version)

    inputs = 's3://mybucket/train'
    mx.fit(inputs=inputs)

    # fit() should train and then stream job logs — nothing else on the session.
    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(mxnet_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    # method_calls[0] is the train call; index [2] selects its kwargs.
    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = mx.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py2'
    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://mybucket/sagemaker-mxnet-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': expected_image_base.format(mxnet_version),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    # A GPU target selects the gpu image; a CPU target must swap in a cpu image.
    assert environment == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = mx.deploy(1, GPU)
    assert isinstance(predictor, MXNetPredictor)
def test_mxnet_local_mode(sagemaker_local_session, mxnet_full_version):
    """Train MXNet MNIST in local mode, deploy locally under a cross-process
    file lock, and run a single prediction."""
    script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

    mx = MXNet(entry_point=script_path, role='SageMakerRole', py_version=PYTHON_VERSION,
               train_instance_count=1, train_instance_type='local',
               sagemaker_session=sagemaker_local_session, framework_version=mxnet_full_version)

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/mxnet_mnist/test')

    mx.fit({'train': train_input, 'test': test_input})
    endpoint_name = mx.latest_training_job.name

    # Since Local Mode uses the same port for serving, we need a lock in order
    # to allow concurrent test execution. The serving test is really fast so it still
    # makes sense to allow this behavior.
    # FIX: the lock file descriptor was previously opened and never closed
    # (leaked on every run); the `with` block now guarantees it is released.
    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()
        try:
            fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
            predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
            data = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(data)
        finally:
            mx.delete_endpoint()
            time.sleep(5)
            fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
def _create_model(output_path):
    """Train an MXNet MNIST estimator in local mode (writing artifacts to
    ``output_path``) and return the resulting model."""
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    estimator = MXNet(
        entry_point=os.path.join(mnist_dir, "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="local",
        output_path=output_path,
        framework_version=mxnet_full_version,  # captured from the enclosing scope
        sagemaker_session=sagemaker_local_session,  # captured from the enclosing scope
    )

    upload = estimator.sagemaker_session.upload_data
    channels = {
        "train": upload(path=os.path.join(mnist_dir, "train"),
                        key_prefix="integ-test-data/mxnet_mnist/train"),
        "test": upload(path=os.path.join(mnist_dir, "test"),
                       key_prefix="integ-test-data/mxnet_mnist/test"),
    }

    estimator.fit(channels)
    return estimator.create_model(1)
"sms_spam_classifier_mxnet_script.py", role=role, train_instance_count=1, train_instance_type="ml.c5.2xlarge", output_path=output_path, base_job_name="sms-spam-classifier-mxnet", framework_version="1.2", code_location=code_location, hyperparameters={ "batch_size": 100, "epochs": 20, "learning_rate": 0.01 }, py_version="py3", ) inputs = { "train": "s3://{0}/{1}/train/".format(bucket_name, bucket_key_prefix), "val": "s3://{0}/{1}/val/".format(bucket_name, bucket_key_prefix), } m.fit(inputs) # deploy the model on sage maker endpoint mxnet_pred = m.deploy( initial_instance_count=1, instance_type="ml.t2.medium", endpoint_name="sagemaker-endpoint", )
# job_name=job_name, # channel_input_dirs=channel_input_dirs, output_path=output_path # output bucket name ) # adding information that is job/runtime specific # note: this isn't being written back to the config file config_manager.put('sagemaker_job_info', 'job_name', job_name) config_manager.put('sagemaker_job_info', 'ckpt_dir', ckpt_dir) config_manager.put('sagemaker_job_info', 'timestamp', timestamp) # write a log for this job run to a new file history_save_path = 'train_history/' config_manager.write_copy(os.path.join(history_save_path, job_name + '.json')) # Call evaluate in seperate process # Process(evaluate_on_timer, ('csvs/test_data.csv', )) # Call Fit. Train path expected to contain both a train_data.csv and test_data.csv file train_path = cfg['train_path'] mx_estimator.fit({"train": str(train_path), "test": str(train_path)}) # Write termination time end = time.time() print("Total traintime: {}".format(end - ts)) config_manager.put('sagemaker_job_info', 'train_runtime', end - ts) config_manager.write_copy(os.path.join(history_save_path, job_name + '.json'))
instance_type = 'ml.p3.2xlarge'

# Spot-capable MXNet estimator; scrapes train/test accuracy out of the training
# logs via the metric regexes so they appear as CloudWatch metrics.
model = MXNet(
    source_dir='source',
    entry_point='model.py',
    py_version='py3',
    framework_version='1.4.1',
    train_instance_count=1,
    train_instance_type=instance_type,
    role=role,
    train_use_spot_instances=True,  # use managed spot training
    train_max_wait=24 * 60 * 60,  # max seconds to wait for spot capacity
    metric_definitions=[  # publish algo metrics to Cloudwatch
        {
            'Name': 'train_acc',
            'Regex': "^.*epoch : accuracy = ([0-9.]+).*$"
        },
        {
            'Name': 'test_acc',
            'Regex': "Test: accuracy: ([0-9.]+).*$"
        }
    ])

# S3 prefix expected to hold the train/val/test/RGB channel data.
inputs = remote_inputs
model.fit(inputs={
    'train': inputs + '/train',
    'val': inputs + '/val',
    'test': inputs + '/test',
    'rgb': inputs + '/RGB'
}, wait=True)
# * Instantiate an estimator object and pass in the code as the entry point parameter. # * Train and deploy the model # In[41]: mnist_estimator = MXNet(entry_point='mnist.py', role=role, output_path=model_artifacts_location, code_location=custom_code_upload_location, train_instance_count=1, train_instance_type='ml.m4.xlarge', hyperparameters={'learning_rate': 0.1}) # In[42]: mnist_estimator.fit({'train': train_data_location, 'test': test_data_location}) # In[43]: predictor = mnist_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge') # ## Validating the model # * Invoke the html script to read in an input. The pixel data from your drawing will be loaded into a data variable in this notebook. # * Using the predictor object to classify the handwritten digit. # * Raw predictions and Labelled predictions display the probabilities of the digit being each of the defined labels. # * Most likely answer prints the label with the maximum probability. # In[76]: HTML(open("input.html").read())
sagemaker_session=sagemaker_session, entry_point="smtrain.py", source_dir="../benchmarks/tr-gpu/mx", role="SageMakerRole", train_instance_count=12, train_instance_type="ml.p3.16xlarge", image_name= "841569659894.dkr.ecr.us-east-1.amazonaws.com/beta-mxnet-training:1.4.1-py3-gpu-build", py_version="py3", output_path="s3://bai-results-sagemaker", train_volume_size=200, framework_version="1.4", distributions={"parameter_server": { "enabled": True }}, ) data = { #"s1": "s3://mxnet-bln-data-sagemaker/small" "train": "s3://mxnet-asimov-data-sagemaker/imagenet/processed/train-480px-q95.rec", "trainidx": "s3://mxnet-asimov-data-sagemaker/imagenet/processed/train-480px-q95.idx", "validate": "s3://mxnet-asimov-data-sagemaker/imagenet/processed/val-480px-q95.rec", "validx": "s3://mxnet-asimov-data-sagemaker/imagenet/processed/val-480px-q95.idx", } tf_estimator.fit(data, logs=True, wait=True)