Example #1
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type,
                            dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={
                              'backend': dist_backend,
                              'epochs': 1
                          })
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})

    with timeout_and_delete_endpoint(estimator=pytorch, minutes=30):
        predictor = pytorch.deploy(initial_instance_count=1,
                                   instance_type=instance_type)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
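
# A hedged sketch (not shown in the example above) of how the mnist_script
# entry point might consume the hyperparameters: SageMaker passes each key in
# `hyperparameters` to the script as a command-line argument, and channel
# locations arrive via SM_CHANNEL_* environment variables.
import argparse
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--backend', type=str, default=None,
                        help='distributed backend, e.g. gloo or nccl')
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--data-dir', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAINING'))
    args = parser.parse_args()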
Example #2
def main():
    testloader = download_training_data()

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # For local training a dummy role will be sufficient
    role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

    print('Starting model training')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )
    cifar10_estimator = PyTorch(entry_point='cifar10_pytorch.py',
                                source_dir='./code',
                                role=role,
                                framework_version='1.7.1',
                                py_version='py3',
                                instance_count=1,
                                instance_type='local',
                                hyperparameters={
                                    'epochs': 1,
                                })

    cifar10_estimator.fit('file://./data/')

    print('Deploying local mode endpoint')
    predictor = cifar10_estimator.deploy(initial_instance_count=1,
                                         instance_type='local')

    do_inference_on_local_endpoint(predictor, testloader)

    predictor.delete_endpoint(predictor.endpoint)
    predictor.delete_model()
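
# The helper used in main() is not shown in this snippet; a hypothetical sketch
# of what do_inference_on_local_endpoint could look like, assuming `testloader`
# yields (images, labels) tensor batches and the predictor accepts numpy input:
def do_inference_on_local_endpoint(predictor, testloader):
    images, labels = next(iter(testloader))
    outputs = predictor.predict(images.numpy())
    predicted = outputs.argmax(axis=1)
    print('Predicted:   ', predicted[:4])
    print('Ground truth:', labels[:4].numpy())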
Example #3
def test_pytorch(time, name_from_base, sagemaker_session,
                 pytorch_inference_version, pytorch_inference_py_version):
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        framework_version=pytorch_inference_version,
        py_version=pytorch_inference_py_version,
        enable_sagemaker_metrics=False,
    )

    inputs = "s3://mybucket/train"

    pytorch.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(pytorch_inference_version,
                                            pytorch_inference_py_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG
    expected_train_args["enable_sagemaker_metrics"] = False

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_uri = image_uris.retrieve(
        "pytorch",
        REGION,
        version=pytorch_inference_version,
        py_version=pytorch_inference_py_version,
        instance_type=GPU,
        image_scope="inference",
    )

    actual_environment = model.prepare_container_def(GPU)
    submit_directory = actual_environment["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"]
    model_url = actual_environment["ModelDataUrl"]
    expected_environment = _get_environment(submit_directory, model_url,
                                            expected_image_uri)
    assert actual_environment == expected_environment

    assert "cpu" in model.prepare_container_def(CPU)["Image"]
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
Example #4
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=pytorch_version,
        py_version=PYTHON_VERSION,
    )

    inputs = "s3://mybucket/train"

    pytorch.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}"
    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz".format(TIMESTAMP),
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": expected_image_base.format(pytorch_version, PYTHON_VERSION),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(GPU)

    assert "cpu" in model.prepare_container_def(CPU)["Image"]
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
Example #5
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      framework_version=pytorch_version,
                      py_version=PYTHON_VERSION)

    inputs = 's3://mybucket/train'

    pytorch.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}'
    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
Example #6
def generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session):
    s3_input_train = sagemaker.s3_input(
        s3_data='s3://{}/{}/data/{}/train.csv'.format(bucket, prefix, ticker),
        content_type='text/csv')
    s3_input_validation = sagemaker.s3_input(
        s3_data='s3://{}/{}/data/{}/validation.csv'.format(bucket, prefix, ticker),
        content_type='text/csv')
    estimator = PyTorch(
        entry_point='train.py',
        source_dir='pytorch',  # this should be just "source" for your code
        role=role,
        framework_version='1.0',
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'input_dim': 26,  # num of features
            'hidden_dim': 260,
            'output_dim': 1,
            'epochs': 200  # could change to higher
        })
    estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})
    predictor = estimator.deploy(initial_instance_count=1,
                                 instance_type="ml.m4.xlarge")
    return predictor
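
# A hedged usage sketch for the helper above; `bucket`, `prefix`, `role`, and
# `sagemaker_session` come from your own setup, and 'AAPL' / `features` are
# placeholders (features should be rows of 26 values, matching input_dim):
predictor = generate_NN_predictor('AAPL', bucket, prefix, role, sagemaker_session)
prediction = predictor.predict(features)
predictor.delete_endpoint()  # shut the ml.m4.xlarge endpoint down when done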
Example #7
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      framework_version=pytorch_version, py_version=PYTHON_VERSION)

    inputs = 's3://mybucket/train'

    pytorch.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}'
    assert {'Environment':
            {'SAGEMAKER_SUBMIT_DIRECTORY':
             's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
             'SAGEMAKER_PROGRAM': 'dummy_script.py',
             'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
             'SAGEMAKER_REGION': 'us-west-2',
             'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'},
            'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
            'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
Example #8
# 
# **NOTE**: When the built-in inference code runs, it imports the `model_fn()` method from the `train.py` file. This is why the training code is wrapped in a main guard (i.e., `if __name__ == '__main__':`); a sketch of that layout follows this note block.
# 
# Since we don't need to change anything in the code that was uploaded during training, we can simply deploy the current model as-is.
# 
# **NOTE:** When deploying a model, you are asking SageMaker to launch a compute instance that waits for data to be sent to it. That compute instance will keep running until *you* shut it down. This is important to know because the cost of a deployed endpoint depends on how long it has been running.
# 
# In other words: **if you are no longer using a deployed endpoint, shut it down!**
# 
# **TODO:** Deploy the trained model.
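# 
# A minimal sketch of the `train.py` layout the first note above describes (names are illustrative; the real script rebuilds its own model class and assumes `import os, torch` at the top):
# 
#     def model_fn(model_dir):
#         # Loaded by the built-in PyTorch inference code when the endpoint starts.
#         return torch.load(os.path.join(model_dir, 'model.pth'))
# 
#     if __name__ == '__main__':
#         ...  # training code; the guard keeps it from running on import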

# In[28]:


# TODO: Deploy the trained model
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
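
# Per the cost note above, shut the endpoint down once you are finished with it.
# Left commented out so it does not run before the testing in Step 7 below:
# predictor.delete_endpoint()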


# ## Step 7 - Use the model for testing
# 
# Once the endpoint is up, we can read in the test data and send it to the model. After collecting all of the results, we can measure how accurate the model is.

# In[29]:


test_X = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)


# In[30]:

Example #9
print(f"input spec (in this case, just an S3 path): {inputs}")

estimator = PyTorch(
    entry_point="code/mnist.py",
    # source_dir="code",
    role=role,
    framework_version="1.4.0",
    instance_count=2,
    instance_type="ml.p3.2xlarge",

    # instance_type="ml.m4.xlarge",
    py_version="py3",
    #use_spot_instances=True,  # Use a spot instance
    #max_run=300,  # Max training time
    #max_wait=600,  # Max training time + spot waiting time
    hyperparameters={
        "epochs": 20,
        "backend": "gloo"
    },
)

print(f"Training job name: {job_name}")

estimator.fit({"training": "s3://" + bucket + "/" + prefix}, job_name=job_name)

# Deploy the model
endpoint_name = f"{stack_name}-{commit_id[:7]}"
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type="ml.m4.xlarge",
                             endpoint_name=endpoint_name)
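
# A hedged follow-up, not part of the original snippet: once testing is done,
# tear the endpoint down so the ml.m4.xlarge instance stops accruing charges.
predictor.delete_endpoint()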