def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script, role='SageMakerRole',
                          train_instance_count=2, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, image_name=ecr_image,
                          hyperparameters={'backend': dist_backend, 'epochs': 1})
        training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
                                                               key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})

    with timeout_and_delete_endpoint(estimator=pytorch, minutes=30):
        predictor = pytorch.deploy(initial_instance_count=1, instance_type=instance_type)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
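

# Hedged sketch of the serving hooks the mnist entry point is assumed to expose
# (the standard SageMaker PyTorch model_fn/predict_fn interface); the network
# and the 'model.pth' filename are illustrative. It shows why the endpoint
# above returns one score per digit, i.e. an output of shape (batch_size, 10).
import os

import torch
import torch.nn as nn


def model_fn(model_dir):
    # Rebuild the (illustrative) network and load the weights the training
    # script is assumed to have saved under model_dir.
    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cpu'))
    return model


def predict_fn(input_data, model):
    # The default input_fn hands over the deserialized payload; return raw
    # class scores so the caller receives a (batch_size, 10) array.
    model.eval()
    with torch.no_grad():
        return model(torch.as_tensor(input_data, dtype=torch.float32))
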
def _test_dist_operations(sagemaker_session,
                          ecr_image,
                          instance_type,
                          dist_backend,
                          train_instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=dist_operations_path,
                          role='SageMakerRole',
                          train_instance_count=train_instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend})
        # Ensure the session's default S3 bucket exists before uploading data.
        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(
            path=dist_operations_path,
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})
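

# Hedged sketch of what a dist_operations entry point might exercise. It
# assumes the container exports the rendezvous variables (MASTER_ADDR,
# MASTER_PORT, RANK, WORLD_SIZE) so init_process_group can use them; the
# collective below is illustrative.
import argparse

import torch
import torch.distributed as dist

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--backend', type=str, default='gloo')
    args, _ = parser.parse_known_args()

    dist.init_process_group(backend=args.backend)

    # Each rank contributes its rank id; after all_reduce every rank should
    # hold 0 + 1 + ... + (world_size - 1).
    tensor = torch.tensor([float(dist.get_rank())])
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    assert tensor.item() == sum(range(dist.get_world_size()))
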
def test_training_smdebug(sagemaker_session, ecr_image, instance_type):
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/opt/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir
    }

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=smdebug_mnist_script,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters=hyperparameters)
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})
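

# Hedged sketch of how the smdebug entry point is assumed to attach the
# debugger hook to the 'smdebug_path' hyperparameter above; apart from the
# smdebug Hook API (out_dir, register_module, register_loss), the names and
# the network are illustrative.
import torch.nn as nn
from smdebug.pytorch import Hook


def build_hooked_model(smdebug_path):
    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    loss_fn = nn.CrossEntropyLoss()

    # Tensors captured by the hook are written under smdebug_path
    # ('/opt/ml/output/tensors'), where the container collects them.
    hook = Hook(out_dir=smdebug_path)
    hook.register_module(model)
    hook.register_loss(loss_fn)
    return model, loss_fn, hook
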
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='train_cifar.py',
                          source_dir=os.path.join(fastai_path, 'cifar'),
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type=MULTI_GPU_INSTANCE,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image)

        pytorch.sagemaker_session.default_bucket()
        training_input = pytorch.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations'
        )

        job_name = utils.unique_name_from_base('test-pytorch-dist-ops')
        pytorch.fit({'training': training_input}, job_name=job_name)

    model_s3_url = pytorch.create_model().model_data
    _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
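

# Hedged sketch of the _assert_s3_file_exists helper the fastai tests rely on:
# it only needs to confirm that the model artifact object exists, so parsing
# the S3 URL and issuing a HEAD-style load() is enough (implementation assumed).
import boto3
from urllib.parse import urlparse


def _assert_s3_file_exists(region, s3_url):
    parsed = urlparse(s3_url)
    s3 = boto3.resource('s3', region_name=region)
    # Object.load() sends a HEAD request and raises if the key is missing.
    s3.Object(parsed.netloc, parsed.path.lstrip('/')).load()
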
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image, py_version):
    if py_version != PYTHON3:
        print('Skipping the test because fastai requires Python 3.6 or later.')
        return

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='train_cifar.py',
                          source_dir=os.path.join(fastai_path, 'cifar'),
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type=MULTI_GPU_INSTANCE,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image)
        pytorch.sagemaker_session.default_bucket()
        training_input = pytorch.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'training': training_input})

    model_s3_url = pytorch.create_model().model_data
    _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
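

# Hedged sketch of the timeout context manager used by every test above,
# implemented here with signal.alarm; the real helper and the DEFAULT_TIMEOUT
# value are assumptions.
import signal
from contextlib import contextmanager

DEFAULT_TIMEOUT = 40  # minutes, assumed


@contextmanager
def timeout(minutes=DEFAULT_TIMEOUT):
    def _handler(signum, frame):
        raise RuntimeError('Timed out after {} minutes'.format(minutes))

    signal.signal(signal.SIGALRM, _handler)
    signal.alarm(minutes * 60)
    try:
        yield
    finally:
        # Always cancel the alarm so a passing test is not interrupted later.
        signal.alarm(0)
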
def test_training(sagemaker_session, ecr_image, instance_type):

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "pytorch-container-integ-test-{}".format(int(
        time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test full customer e2e from sagemaker-pytorch-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "pytorch-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
        "data_dir": training_dir,
    }

    training_job_name = utils.unique_name_from_base(
        "test-pytorch-experiments-image")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=smdebug_mnist_script,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters=hyperparameters,
        )
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix="pytorch/mnist")
        pytorch.fit({"training": training_input}, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))
    assert len(trial_components) == 1

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
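

# Hedged follow-up: a check that could run before the cleanup block to confirm
# that add_trial_component() above actually associated the component with the
# trial. The helper name is illustrative; it only uses the SageMaker
# ListTrialComponents API on the boto3 client already in scope.
def _assert_trial_component_associated(sm_client, trial_name, trial_component_name):
    summaries = sm_client.list_trial_components(
        TrialName=trial_name)["TrialComponentSummaries"]
    names = [s["TrialComponentName"] for s in summaries]
    assert trial_component_name in names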