def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    """Run a two-instance distributed MNIST training job with the given
    backend, deploy the resulting model, and smoke-test the endpoint by
    checking the prediction output shape for a random batch.
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        # NOTE(review): this helper builds a PytorchTestEstimator with
        # docker_image_uri, while the sibling tests use PyTorch/image_name —
        # confirm the custom estimator class is still required here.
        estimator = PytorchTestEstimator(
            entry_point=mnist_script,
            role='SageMakerRole',
            train_instance_count=2,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            docker_image_uri=ecr_image,
            hyperparameters={'backend': dist_backend, 'epochs': 1})

        mnist_input = estimator.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        estimator.fit({'training': mnist_input})

    # Deploy and verify: one random batch in, (batch, 10) class scores out.
    with timeout_and_delete_endpoint(estimator=estimator, minutes=30):
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type=instance_type)

        batch_size = 100
        sample = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        prediction = predictor.predict(sample)

        assert prediction.shape == (batch_size, 10)
def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend,
                          train_instance_count=3):
    """Launch a multi-instance training job that exercises the distributed
    communication primitives for *dist_backend*.

    The uploaded "input" is just the script path itself — the job only needs
    some channel data to start, hence the 'required_argument' channel name.
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point=dist_operations_path,
            role='SageMakerRole',
            train_instance_count=train_instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters={'backend': dist_backend})

        # Ensure the session's default bucket exists before uploading.
        estimator.sagemaker_session.default_bucket()
        fake_input = estimator.sagemaker_session.upload_data(
            path=dist_operations_path,
            key_prefix='pytorch/distributed_operations')
        estimator.fit({'required_argument': fake_input})
def test_training_smdebug(sagemaker_session, ecr_image, instance_type):
    """Train the smdebug-instrumented MNIST script for one epoch, writing
    debugger tensors to /opt/ml/output/tensors inside the container.
    """
    params = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/opt/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir,
    }

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point=smdebug_mnist_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters=params)

        mnist_input = estimator.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        estimator.fit({'training': mnist_input})
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image):
    """Train the fastai CIFAR example on a multi-GPU instance and assert the
    resulting model artifact was written to S3.

    NOTE(review): a later definition in this file reuses this exact function
    name, so Python binds the name to the later one and pytest will not
    collect this version — one of the two should be renamed or removed.
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image)

        # Ensure the session's default bucket exists before uploading.
        estimator.sagemaker_session.default_bucket()
        cifar_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations'
        )

        job_name = utils.unique_name_from_base('test-pytorch-dist-ops')
        estimator.fit({'training': cifar_input}, job_name=job_name)

        model_s3_url = estimator.create_model().model_data
        _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image, py_version):
    """Train the fastai CIFAR example on a multi-GPU instance (Python 3 only)
    and assert the model artifact landed in S3.

    NOTE(review): this redefines the function name used by an earlier test in
    this file, shadowing it — pytest only collects this later definition.
    """
    # fastai requires Python >= 3.6; silently pass on other interpreters.
    if py_version != PYTHON3:
        print('Skipping the test because fastai supports >= Python 3.6.')
        return

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image)

        # Ensure the session's default bucket exists before uploading.
        estimator.sagemaker_session.default_bucket()
        cifar_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations')
        estimator.fit({'training': cifar_input})

        model_s3_url = estimator.create_model().model_data
        _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
def test_training(sagemaker_session, ecr_image, instance_type):
    """End-to-end SageMaker Experiments integration test: run a training job,
    verify a trial component was auto-created for it, associate it with a
    trial, then clean everything up.

    Fix: the cleanup calls originally ran only on full success, leaking the
    Experiment/Trial (and the trial-component association) whenever training
    or verification failed. Cleanup now runs in a ``finally`` block, with the
    component-specific teardown guarded on the component having been resolved.
    """
    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "pytorch-container-integ-test-{}".format(int(
        time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test full customer e2e from sagemaker-pytorch-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "pytorch-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    trial_component_summary = None
    trial_component = None
    try:
        hyperparameters = {
            "random_seed": True,
            "num_steps": 50,
            "smdebug_path": "/opt/ml/output/tensors",
            "epochs": 1,
            "data_dir": training_dir,
        }

        training_job_name = utils.unique_name_from_base(
            "test-pytorch-experiments-image")

        # create a training job and wait for it to complete
        with timeout(minutes=DEFAULT_TIMEOUT):
            pytorch = PyTorch(
                entry_point=smdebug_mnist_script,
                role="SageMakerRole",
                train_instance_count=1,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image,
                hyperparameters=hyperparameters,
            )
            training_input = pytorch.sagemaker_session.upload_data(
                path=training_dir, key_prefix="pytorch/mnist")
            pytorch.fit({"training": training_input}, job_name=training_job_name)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))
        trial_component_summary = trial_components[0]
        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
    finally:
        # cleanup — always runs so failed runs do not leak Experiments/Trials
        # in the account. Component teardown only applies when the component
        # was actually resolved above.
        if trial_component_summary is not None:
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
        if trial_component is not None:
            trial_component.delete()
        trial.delete()
        experiment.delete()