def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
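    # Override the role, model server worker count, and VPC config at model-creation time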
    model = pytorch.create_model(role=new_role,
                                 model_server_workers=model_server_workers,
                                 vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def test_dist_operations_fastai_gpu(sagemaker_session, framework_version,
                                    ecr_image):
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    if Version(image_framework_version) == Version("1.9"):
        pytest.skip("fastai is not supported on PyTorch v1.9")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            instance_count=1,
            instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
        )
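        # default_bucket() creates the session's default S3 bucket if it does not exist yet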
        pytorch.sagemaker_session.default_bucket()
        training_input = pytorch.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'training': training_input},
                    job_name=utils.unique_name_from_base('test-pt-fastai'))

    model_s3_url = pytorch.create_model().model_data
    _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
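
For reference, a minimal sketch of what the _assert_s3_file_exists helper used above could look like; the real helper is not shown in this listing, so treat this as an assumption:

import boto3
from urllib.parse import urlparse

def _assert_s3_file_exists(region, s3_url):
    # HEAD the object; boto3 raises botocore.exceptions.ClientError if it is missing
    parsed = urlparse(s3_url)
    s3 = boto3.resource("s3", region_name=region)
    s3.Object(parsed.netloc, parsed.path.lstrip("/")).load()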
def train_in_sagemaker(role, data_channels: dict, server_source_dir: str,
                       aws_account_id: str, aws_region: str, device: str,
                       debug: bool, hyperparameters: dict):
    instance_type, image_version = __get_instance_info(device=device,
                                                       debug=debug,
                                                       mode="training")

    # create estimator
    image_url_training = "{}.dkr.ecr.{}.amazonaws.com/youyakuman:{}".format(
        aws_account_id, aws_region, image_version)
    print("image_url : {}".format(image_url_training))
    estimator = PyTorch(entry_point="youyakuman_train_and_deploy.py",
                        source_dir=server_source_dir,
                        role=role,
                        framework_version='1.5.0',
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        hyperparameters=hyperparameters,
                        image_name=image_url_training)

    # start to train
    date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    job_name = "youyakuman-{}-{}".format(device, date_str)
    print("job_name is {}".format(job_name))
    estimator.fit(data_channels, job_name=job_name)

    return estimator, job_name
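
A hypothetical invocation of train_in_sagemaker; the account ID, region, device value, and S3 channel layout below are placeholders rather than values taken from this listing:

estimator, job_name = train_in_sagemaker(
    role="SageMakerRole",
    data_channels={"training": "s3://mybucket/youyakuman/train"},
    server_source_dir="./server",
    aws_account_id="123456789012",
    aws_region="us-east-1",
    device="gpu",
    debug=False,
    hyperparameters={"epochs": 1},
)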
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type,
                            dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={
                              'backend': dist_backend,
                              'epochs': 1
                          })
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})

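    # timeout_and_delete_endpoint tears the endpoint down when the block exits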
    with timeout_and_delete_endpoint(estimator=pytorch, minutes=30):
        predictor = pytorch.deploy(initial_instance_count=1,
                                   instance_type=instance_type)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    image = 'pytorch:9000'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      image_name=image,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'new_name'
    pytorch.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
def test_horovod_training(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "train.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        framework_version=framework_version,
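        # The sagemaker_mpi_* hyperparameters enable the container's MPI launcher for Horovod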
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
            "epochs": 1,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()
def _test_dgl_LT_09x_training(ecr_image, sagemaker_session, instance_type):
    dgl = PyTorch(
        entry_point=DGL_LT_09x_SCRIPT_PATH,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        job_name = utils.unique_name_from_base("test-pytorch-dgl-image")
        dgl.fit(job_name=job_name)
def test_mnist_gpu(sagemaker_session, ecr_image, py_version, dist_gpu_backend):
    pytorch = PyTorch(entry_point=mnist_script,
                      role='SageMakerRole',
                      train_instance_count=2,
                      image_name=ecr_image,
                      train_instance_type=MULTI_GPU_INSTANCE,
                      sagemaker_session=sagemaker_session,
                      hyperparameters={'backend': dist_gpu_backend})

    training_input = sagemaker_session.upload_data(path=os.path.join(data_dir, 'training'),
                                                   key_prefix='pytorch/mnist')
    pytorch.fit({'training': training_input})
def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend, train_instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=dist_operations_path,
                          role='SageMakerRole',
                          train_instance_count=train_instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend})
        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(path=dist_operations_path,
                                                           key_prefix='pytorch/distributed_operations')
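        # fit() needs at least one channel, so the script directory doubles as a dummy input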
        pytorch.fit({'required_argument': fake_input})
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      framework_version=pytorch_version,
                      py_version=PYTHON_VERSION)

    inputs = 's3://mybucket/train'

    pytorch.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}'
    assert {'Environment':
            {'SAGEMAKER_SUBMIT_DIRECTORY':
             's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
             'SAGEMAKER_PROGRAM': 'dummy_script.py',
             'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
             'SAGEMAKER_REGION': 'us-west-2',
             'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'},
            'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
            'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image,
                                                  instance_type, py_version,
                                                  n_virginia_sagemaker_session,
                                                  tmpdir, test_script,
                                                  num_processes):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=test_script,
            role='SageMakerRole',
            image_uri=n_virginia_ecr_image,
            source_dir=mnist_path,
            instance_count=2,
            instance_type=instance_type,
            sagemaker_session=n_virginia_sagemaker_session,
            hyperparameters={
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5
            },
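            # Split the model into 2 partitions with an interleaved pipeline over
            # 4 microbatches, wrap with DDP, and launch the workers via MPI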
            distribution={
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled":
                    True,
                    "processes_per_host":
                    num_processes,
                    "custom_mpi_options":
                    "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        )
        pytorch.fit(
            job_name=utils.unique_name_from_base('test-pt-smdmp-multinode'))
def test_horovod_simple(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/horovod")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "simple.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()

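    # estimator.model_data is an s3:// URI; split it into bucket name and key prefix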
    bucket, key_prefix = estimator.model_data.replace("s3://", "").split("/", 1)
    sagemaker_session.download_data(path=str(tmpdir),
                                    bucket=bucket,
                                    key_prefix=key_prefix)

    with tarfile.open(os.path.join(str(tmpdir), "model.tar.gz")) as tar:
        tar.extractall(tmpdir)

    size = instances * processes

    for rank in range(size):
        local_rank = rank % processes
        # The simple.py script should create a JSON file with this name
        filename = "local-rank-%s-rank-%s.json" % (local_rank, rank)

        with open(os.path.join(str(tmpdir), filename)) as file:
            actual = json.load(file)
        expected = {"local-rank": local_rank, "rank": rank, "size": size}

        assert actual == expected
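
The simple.py entry point is not shown in this listing; a minimal sketch that would satisfy the assertions above might look like this. The SM_MODEL_DIR convention is an assumption here: files written there are packaged into model.tar.gz after training.

import json
import os

import horovod.torch as hvd

hvd.init()

# One record per process: the test reads these back after extracting model.tar.gz
record = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()}
filename = "local-rank-%s-rank-%s.json" % (hvd.local_rank(), hvd.rank())

with open(os.path.join(os.environ["SM_MODEL_DIR"], filename), "w") as f:
    json.dump(record, f)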