# Imports required by these tests.
import json
import os
import tarfile

from sagemaker.pytorch import PyTorch

# Assumed to be defined elsewhere in the test module / conftest:
# resources_path, mnist_script, data_dir, training_dir, dist_operations_path,
# MULTI_GPU_INSTANCE, DEFAULT_TIMEOUT, and the timeout() context manager.
#
# Note: these tests use SageMaker Python SDK v1 argument names
# (train_instance_type, train_instance_count, image_name); SDK v2 renamed
# them to instance_type, instance_count, and image_uri.


def test_horovod_training(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
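    """Run a short (1-epoch) Horovod training job over MPI and assert it
    completes within the timeout."""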
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "train.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
            "epochs": 1,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()


def test_horovod_simple(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
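    """Run a Horovod job and verify each rank reported its topology.

    The entry point writes one JSON file per process recording its rank,
    local rank, and world size; the test downloads the model artifact and
    checks that every expected file is present and correct.
    """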
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/horovod")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "simple.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()

    # estimator.model_data is an s3:// URI to the model.tar.gz artifact;
    # split it into bucket and key prefix so it can be downloaded locally.
    bucket, key_prefix = estimator.model_data.replace("s3://", "").split("/", 1)
    sagemaker_session.download_data(path=str(tmpdir),
                                    bucket=bucket,
                                    key_prefix=key_prefix)

    with tarfile.open(os.path.join(str(tmpdir), "model.tar.gz")) as tar:
        tar.extractall(str(tmpdir))

    # Total world size: one process per slot across all training hosts.
    size = instances * processes

    for rank in range(size):
        local_rank = rank % processes
        # The simple.py script should create a JSON file with this name
        filename = "local-rank-%s-rank-%s.json" % (local_rank, rank)

        with open(os.path.join(str(tmpdir), filename)) as file:
            actual = json.load(file)
        expected = {"local-rank": local_rank, "rank": rank, "size": size}

        assert actual == expected


def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend):
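    """Run distributed MNIST training on two multi-GPU instances with the
    given distributed backend."""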
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          image_name=image_uri,
                          train_instance_type=MULTI_GPU_INSTANCE,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'backend': dist_gpu_backend})

        training_input = sagemaker_session.upload_data(
            path=os.path.join(data_dir, 'training'),
            key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})


def _test_mnist_distributed(sagemaker_session, image_uri, instance_type,
                            dist_backend):
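    """Run a 1-epoch distributed MNIST training job on two instances using
    the given backend."""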
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=image_uri,
                          hyperparameters={
                              'backend': dist_backend,
                              'epochs': 1
                          })
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})


def _test_dist_operations(sagemaker_session,
                          image_uri,
                          instance_type,
                          dist_backend,
                          train_instance_count=3):
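    """Run the dist_operations test script across train_instance_count
    instances with the given distributed backend."""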
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=dist_operations_path,
                          role='SageMakerRole',
                          train_instance_count=train_instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=image_uri,
                          hyperparameters={'backend': dist_backend})
        # default_bucket() creates the session's default S3 bucket if it
        # does not already exist, so the upload below has somewhere to go.
        pytorch.sagemaker_session.default_bucket()
        # The training script does not consume input data, but fit() is
        # given a placeholder channel anyway; the script directory itself
        # is uploaded as the dummy input.
        fake_input = pytorch.sagemaker_session.upload_data(
            path=dist_operations_path,
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})


def test_smdataparallel_training(instances, train_instance_type,
                                 sagemaker_session, image_uri,
                                 framework_version, tmpdir):
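    """Run MNIST training with SageMaker distributed data parallel
    (smdistributed.dataparallel) enabled via hyperparameters."""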
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket,
                                         "pytorch/smdataparallel")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "mnist",
                                 "smdataparallel_mnist.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={"sagemaker_distributed_dataparallel_enabled": True})

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()