def test_tf_vpc_multi(sagemaker_session, tf_full_version):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = 'ml.c4.xlarge'
    instance_count = 2

    # Upload training data and resolve the entry-point script for the iris DNN classifier.
    train_input = sagemaker_session.upload_data(
        path=os.path.join(DATA_DIR, 'iris', 'data'),
        key_prefix='integ-test-data/tf_iris')
    script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

    # Provision (or reuse) VPC resources and allow inter-container traffic encryption
    # on the security group.
    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)
    setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(entry_point=script_path,
                           role='SageMakerRole',
                           framework_version=tf_full_version,
                           training_steps=1,
                           evaluation_steps=1,
                           hyperparameters={'input_tensor_name': 'inputs'},
                           train_instance_count=instance_count,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-vpc-tf',
                           subnets=subnet_ids,
                           security_group_ids=[security_group_id],
                           encrypt_inter_container_traffic=True)

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(train_input)
        print('training job succeeded: {}'.format(estimator.latest_training_job.name))

    # The training job description must echo back the requested VPC config and
    # the traffic-encryption flag.
    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=estimator.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
    assert job_desc['EnableInterContainerTrafficEncryption'] is True

    # Deploy to an endpoint (auto-deleted by the context manager) and verify
    # that dict-style and list-style predict inputs agree, and that the model
    # inherited the same VPC config.
    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        json_predictor = model.deploy(initial_instance_count=instance_count,
                                      instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))
        assert dict_result == list_result

        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=model.name)
        assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
        assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = cpu_instance_type
    instance_count = 2

    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = "mnist.py"

    # Provision (or reuse) VPC resources and open the security group for
    # encrypted inter-container traffic.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    # Small tuning job: two trials over a one-hyperparameter search space.
    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist"
        )

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
def test_transform_pytorch_vpc_custom_model_bucket(
    sagemaker_session,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
    cpu_instance_type,
    custom_bucket_name,
):
    """Run a PyTorch batch transform inside a VPC with model data in a custom bucket."""
    data_dir = os.path.join(DATA_DIR, "pytorch_mnist")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)

    # Stage the pre-trained model artifact in the caller-supplied bucket so we
    # can later assert the model references it.
    model_data = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "model.tar.gz"),
        bucket=custom_bucket_name,
        key_prefix="integ-test-data/pytorch_mnist/model",
    )

    model = PyTorchModel(
        model_data=model_data,
        entry_point=os.path.join(data_dir, "mnist.py"),
        role="SageMakerRole",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        sagemaker_session=sagemaker_session,
        vpc_config={"Subnets": subnet_ids, "SecurityGroupIds": [security_group_id]},
        code_location="s3://{}".format(custom_bucket_name),
    )

    transform_input = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "transform", "data.npy"),
        key_prefix="integ-test-data/pytorch_mnist/transform",
    )

    transformer = model.transformer(1, cpu_instance_type)
    transformer.transform(
        transform_input,
        content_type="application/x-npy",
        job_name=unique_name_from_base("test-transform-vpc"),
    )

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()

        # The created model must carry the requested VPC config and point at
        # the custom bucket for its model data.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"])
        assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"]

        model_bucket, _ = s3.parse_s3_url(model_desc["PrimaryContainer"]["ModelDataUrl"])
        assert custom_bucket_name == model_bucket
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    # Provision (or reuse) VPC resources; enable encrypted inter-container
    # traffic on the security group.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name
    )
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris"
    )

    # Two-trial tuning job minimizing the training loss.
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}
    objective_metric_name = "loss"
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Brief pause so the tuning job is registered before we start polling.
        time.sleep(15)
        tuner.wait()
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    """Train an MXNet model and run a batch transform, both inside the same VPC."""
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(data_path, "mnist.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name
    )

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
    )

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
    )
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
    )

    job_name = unique_name_from_base("test-mxnet-vpc")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_input, "test": test_input}, job_name=job_name)

    # Training job must echo back the requested VPC config.
    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name
    )
    assert set(subnet_ids) == set(job_desc["VpcConfig"]["Subnets"])
    assert [security_group_id] == job_desc["VpcConfig"]["SecurityGroupIds"]

    transform_input_path = os.path.join(data_path, "transform", "data.csv")
    transform_input_key_prefix = "integ-test-data/mxnet_mnist/transform"
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()

        # The transform's model must inherit the same VPC config.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"])
        assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"]
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    """Train an MXNet model and run a batch transform, both inside the same VPC."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    mx = MXNet(entry_point=script_path,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type='ml.c4.xlarge',
               sagemaker_session=sagemaker_session,
               framework_version=mxnet_full_version,
               subnets=subnet_ids,
               security_group_ids=[security_group_id])

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    # Training job must echo back the requested VPC config.
    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()

        # The transform's model must inherit the same VPC config.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
        assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']