def test_tf_vpc_multi(sagemaker_session, tf_full_version):
    """Train and host the iris TensorFlow classifier on two instances in a VPC.

    The estimator is configured with a set of subnets, a single security
    group and inter-container traffic encryption. After training, the test
    asserts that the training job description reports that same VPC
    configuration and the encryption flag. It then deploys the model and
    asserts that a dict-style and a list-style prediction request return
    equal results, and that the model description reports the same VPC
    configuration.
    """
    instance_type = 'ml.c4.xlarge'
    instance_count = 2
    iris_dir = os.path.join(DATA_DIR, 'iris')

    train_input = sagemaker_session.upload_data(
        path=os.path.join(iris_dir, 'data'),
        key_prefix='integ-test-data/tf_iris',
    )

    # Look up (or create) the VPC resources and open the security group for
    # encrypted inter-container traffic before building the estimator.
    boto_session = sagemaker_session.boto_session
    ec2_client = boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, boto_session.region_name
    )
    setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=os.path.join(iris_dir, 'iris-dnn-classifier.py'),
        role='SageMakerRole',
        framework_version=tf_full_version,
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={'input_tensor_name': 'inputs'},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name='test-vpc-tf',
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(train_input)
        training_job_name = estimator.latest_training_job.name
        print('training job succeeded: {}'.format(training_job_name))

    sagemaker_client = sagemaker_session.sagemaker_client
    expected_subnets = set(subnet_ids)

    # The training job must report the VPC settings it was launched with.
    job_desc = sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    assert expected_subnets == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
    assert job_desc['EnableInterContainerTrafficEncryption'] is True

    # The endpoint is named after the training job; the context manager is
    # given that name so it can clean the endpoint up.
    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        json_predictor = model.deploy(
            initial_instance_count=instance_count,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
        )

        # Both request shapes must produce the same prediction.
        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))
        assert dict_result == list_result

        # The hosted model must carry the same VPC settings as training.
        model_desc = sagemaker_client.describe_model(ModelName=model.name)
        assert expected_subnets == set(model_desc['VpcConfig']['Subnets'])
        assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Run a hyperparameter tuning job for the MNIST TensorFlow script in a VPC.

    A two-instance TensorFlow estimator is configured with subnets, a single
    security group and inter-container traffic encryption, then wrapped in a
    tuner that searches ``epochs`` over 1..2 against the ``accuracy`` metric
    with at most two jobs, both allowed to run in parallel. The test uploads
    the MNIST data and starts the tuning job under a timeout; it makes no
    assertions of its own.
    """
    # NOTE(review): another function with this same name appears later in this
    # file; if both live in one module the later definition replaces this
    # one -- confirm.
    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point="mnist.py",
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    objective_metric_name = "accuracy"
    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        {"epochs": IntegerParameter(1, 2)},
        [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_dir = os.path.join(resource_path, "data")
        inputs = estimator.sagemaker_session.upload_data(
            path=data_dir,
            key_prefix="scriptmode/mnist",
        )

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Run a hyperparameter tuning job for the iris TensorFlow script in a VPC.

    A two-instance ``ml.c4.xlarge`` TensorFlow estimator is configured with
    subnets, a single security group and inter-container traffic encryption,
    then wrapped in a tuner that minimizes the ``loss`` metric while varying
    ``learning_rate`` between 0.05 and 0.2, with at most two jobs that may
    run in parallel. The test starts the tuning job under a timeout and
    waits on it; it makes no assertions of its own.
    """
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name
    )
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py"),
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=2,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH,
        key_prefix="integ-test-data/tf_iris",
    )

    search_space = {"learning_rate": ContinuousParameter(0.05, 0.2)}
    loss_metric = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]
    tuner = HyperparameterTuner(
        estimator,
        "loss",
        search_space,
        loss_metric,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Pause briefly after launching the job before blocking on it.
        time.sleep(15)
        tuner.wait()