import pytest import sagemaker.utils import tests.integ as integ from sagemaker.pytorch import PyTorch from tests.integ import timeout from tests.integ.test_pytorch import _upload_training_data smdataparallel_dir = os.path.join(os.path.dirname(__file__), "..", "data", "smdistributed_dataparallel") @pytest.mark.skipif( integ.test_region() not in integ.DATA_PARALLEL_TESTING_REGIONS, reason= "Only allow this test to run in IAD and CMH to limit usage of p3.16xlarge", ) def test_smdataparallel_pt_mnist( sagemaker_session, pytorch_training_latest_version, pytorch_training_latest_py_version, ): job_name = sagemaker.utils.unique_name_from_base( "pt-sm-distributed-dataparallel") estimator = PyTorch( entry_point="mnist_pt.py", role="SageMakerRole", source_dir=smdataparallel_dir, instance_count=2,
# language governing permissions and limitations under the License. from __future__ import absolute_import import os import pytest from sagemaker.huggingface import HuggingFace from tests import integ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES from tests.integ.timeout import timeout @pytest.mark.release @pytest.mark.skipif( integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region", ) def test_huggingface_training( sagemaker_session, gpu_instance_type, huggingface_training_latest_version, huggingface_pytorch_latest_version, ): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): data_path = os.path.join(DATA_DIR, "huggingface") hf = HuggingFace( py_version="py36", entry_point="examples/text-classification/run_glue.py", role="SageMakerRole",
entry_point="mnist.py", framework_version=pytorch_inference_latest_version, py_version=pytorch_inference_latest_py_version, sagemaker_session=sagemaker_session, ) predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) batch_size = 100 data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32) output = predictor.predict(data) assert output.shape == (batch_size, 10) @pytest.mark.skipif( test_region() not in EI_SUPPORTED_REGIONS, reason="EI isn't supported in that specific region." ) def test_deploy_model_with_accelerator( sagemaker_session, cpu_instance_type, pytorch_eia_latest_version, pytorch_eia_latest_py_version, ): endpoint_name = "test-pytorch-deploy-eia-{}".format(sagemaker_timestamp()) model_data = sagemaker_session.upload_data(path=EIA_MODEL) pytorch = PyTorchModel( model_data, "SageMakerRole", entry_point=EIA_SCRIPT, framework_version=pytorch_eia_latest_version, py_version=pytorch_eia_latest_py_version,
sagemaker_session=sagemaker_session, ) predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) batch_size = 100 data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32) output = predictor.predict(data) assert output.shape == (batch_size, 10) @pytest.mark.skipif(PYTHON_VERSION == "py2", reason="PyTorch EIA does not support Python 2.") @pytest.mark.skipif(test_region() not in EI_SUPPORTED_REGIONS, reason="EI isn't supported in that specific region.") def test_deploy_model_with_accelerator(sagemaker_session, cpu_instance_type): endpoint_name = "test-pytorch-deploy-eia-{}".format(sagemaker_timestamp()) model_data = sagemaker_session.upload_data(path=EIA_MODEL) pytorch = PyTorchModel( model_data, "SageMakerRole", framework_version="1.3.1", entry_point=EIA_SCRIPT, sagemaker_session=sagemaker_session, ) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): predictor = pytorch.deploy( initial_instance_count=1, instance_type=cpu_instance_type,
endpoint_name = "test-mxnet-coach-deploy-{}".format(sagemaker_timestamp()) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): predictor = estimator.deploy( 1, cpu_instance_type, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name ) observation = numpy.asarray([0, 0, 0, 0]) action = predictor.predict(observation) assert 0 < action[0][0] < 1 assert 0 < action[0][1] < 1 @pytest.mark.skipif( test_region() not in RL_SUPPORTED_REGIONS, reason="Updated RL images aren't in {}".format(test_region()), ) def test_coach_tf(sagemaker_session, coach_tensorflow_latest_version, cpu_instance_type): estimator = _test_coach( sagemaker_session, RLFramework.TENSORFLOW, coach_tensorflow_latest_version, cpu_instance_type, ) job_name = unique_name_from_base("test-coach-tf") with timeout(minutes=15): estimator.fit(job_name=job_name) endpoint_name = "test-tf-coach-deploy-{}".format(sagemaker_timestamp())
# Path to the Horovod test assets: "../data/horovod" relative to this file's directory.
horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")


@pytest.fixture(scope="module")
def gpu_instance_type(request):
    """Module-scoped fixture that returns the fixed instance type ``ml.p2.xlarge``.

    ``request`` is accepted but not read by the body.
    """
    return "ml.p2.xlarge"


@pytest.mark.canary_quick
def test_hvd_cpu(sagemaker_session, cpu_instance_type, tmpdir):
    """Run the create-and-fit helper with the ``cpu_instance_type`` fixture value.

    The helper ``__create_and_fit_estimator`` is defined outside the visible
    chunk; this test only forwards its three fixtures to it.
    """
    __create_and_fit_estimator(sagemaker_session, cpu_instance_type, tmpdir)


@pytest.mark.canary_quick
@pytest.mark.skipif(
    integ.test_region() in integ.HOSTING_NO_P2_REGIONS,
    reason="no ml.p2 instances in this region"
)
def test_hvd_gpu(sagemaker_session, gpu_instance_type, tmpdir):
    """Run the create-and-fit helper with the ``gpu_instance_type`` fixture value.

    Skipped when the current test region is listed in
    ``integ.HOSTING_NO_P2_REGIONS``. The helper ``__create_and_fit_estimator``
    is defined outside the visible chunk.
    """
    __create_and_fit_estimator(sagemaker_session, gpu_instance_type, tmpdir)


@pytest.mark.local_mode
@pytest.mark.parametrize("instances, processes", [[1, 2], (2, 1), (2, 2)])
def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir):
    output_path = "file://%s" % tmpdir
    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
    estimator = TensorFlow(
        entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
        role="SageMakerRole",
        train_instance_count=2,
sagemaker_session=sagemaker_session, ) train_input = mx.sagemaker_session.upload_data( path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train" ) test_input = mx.sagemaker_session.upload_data( path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test" ) mx.fit({"train": train_input, "test": test_input}) return mx.latest_training_job.name @pytest.mark.skipif( test_region() not in EDGE_PACKAGING_SUPPORTED_REGIONS, reason="Edge packaging isn't supported in that specific region.", ) def test_edge_packaging_job(mxnet_training_job, sagemaker_session): estimator = MXNet.attach(mxnet_training_job, sagemaker_session=sagemaker_session) model = estimator.compile_model( target_instance_family="rasp3b", input_shape={"data": [1, 1, 28, 28], "softmax_label": [1]}, output_path=estimator.output_path, ) model.package_for_edge( output_path=estimator.output_path, role=estimator.role, model_name="sdk-test-model", model_version="1.0",
import sagemaker.utils
import tests.integ as integ
from sagemaker.tensorflow import TensorFlow
from tests.integ import test_region, timeout, HOSTING_NO_P3_REGIONS

# Path to the Horovod test assets: "../data/horovod" relative to this file's directory.
horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")


@pytest.fixture(
    scope="session",
    params=[
        "ml.c4.xlarge",
        pytest.param(
            "ml.p3.2xlarge",
            marks=pytest.mark.skipif(
                test_region() in HOSTING_NO_P3_REGIONS, reason="no ml.p3 instances in this region"),
        ),
    ],
)
def instance_type(request):
    """Session-scoped, parametrized fixture that returns the current instance type.

    The parameters are ``ml.c4.xlarge`` and ``ml.p3.2xlarge``; the latter
    carries a ``skipif`` mark that applies when the test region is in
    ``HOSTING_NO_P3_REGIONS``.
    """
    return request.param


@pytest.mark.canary_quick
def test_horovod(sagemaker_session, instance_type, tmpdir):
    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
    estimator = TensorFlow(
        entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
        role="SageMakerRole",
        train_instance_count=2,
import pytest
from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig
from tests import integ
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
from tests.integ.timeout import timeout


@pytest.fixture(scope="module")
def gpu_instance_type(request):
    """Module-scoped fixture that returns the fixed instance type ``ml.p3.2xlarge``.

    ``request`` is accepted but not read by the body.
    """
    return "ml.p3.2xlarge"


# Skipped unless the region is in TRAINING_COMPILER_SUPPORTED_REGIONS, and also
# skipped when the region is in TRAINING_NO_P3_REGIONS.
@pytest.mark.release
@pytest.mark.skipif(
    integ.test_region() not in integ.TRAINING_COMPILER_SUPPORTED_REGIONS,
    reason="SageMaker Training Compiler is not supported in this region",
)
@pytest.mark.skipif(
    integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
    reason="no ml.p3 instances in this region",
)
def test_huggingface_pytorch(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_pytorch_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")
sagemaker_session, cpu_instance_type, tmpdir, ): _create_and_fit_estimator( mxnet_training_latest_version, mxnet_training_latest_py_version, sagemaker_session, cpu_instance_type, tmpdir, ) @pytest.mark.release @pytest.mark.skipif( integ.test_region() in integ.TRAINING_NO_P2_REGIONS and integ.test_region() in integ.TRAINING_NO_P3_REGIONS, reason="no ml.p2 or ml.p3 instances in this region", ) def test_hvd_gpu( mxnet_training_latest_version, mxnet_training_latest_py_version, sagemaker_session, gpu_instance_type, tmpdir, ): _create_and_fit_estimator( mxnet_training_latest_version, mxnet_training_latest_py_version, sagemaker_session, gpu_instance_type,
# language governing permissions and limitations under the License. from __future__ import absolute_import import os import pytest from sagemaker.huggingface import HuggingFace from tests import integ from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES from tests.integ.timeout import timeout @pytest.mark.release @pytest.mark.skipif( integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region" ) def test_huggingface_training( sagemaker_session, gpu_instance_type, huggingface_training_latest_version, huggingface_pytorch_version, ): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): data_path = os.path.join(DATA_DIR, "huggingface") hf = HuggingFace( py_version="py36", entry_point="examples/text-classification/run_glue.py", role="SageMakerRole", transformers_version=huggingface_training_latest_version,
import pytest from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor from sagemaker.utils import unique_name_from_base from tests import integ from tests.integ.utils import gpu_list, retry_with_instance_list from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name ROLE = "SageMakerRole" @pytest.mark.release @pytest.mark.skipif( integ.test_region() in integ.TRAINING_NO_P2_REGIONS and integ.test_region() in integ.TRAINING_NO_P3_REGIONS, reason="no ml.p2 or ml.p3 instances in this region", ) @retry_with_instance_list(gpu_list(integ.test_region())) def test_framework_processing_job_with_deps( sagemaker_session, huggingface_training_latest_version, huggingface_training_pytorch_latest_version, huggingface_pytorch_latest_training_py_version, **kwargs, ): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs") entry_point = "main_script.py"