Example #1
def test_sagemakermodel_passes_correct_params_to_scala():

    model_image = "model-abc-123"
    model_path = S3DataPath("my-bucket", "model-abc-123")
    role_arn = "role-789"
    endpoint_instance_type = "c4.8xlarge"

    model = SageMakerModel(
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        modelImage=model_image,
        modelPath=model_path,
        modelEnvironmentVariables=None,
        modelExecutionRoleARN=role_arn,
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    assert model.modelImage == model_image
    assert model.modelPath.bucket == model_path.bucket
    assert model.modelExecutionRoleARN == role_arn
    assert model.endpointInstanceType == endpoint_instance_type
    assert model.existingEndpointName is None
Example #2
def test_sagemakermodel_can_do_resource_cleanup():
    endpoint_name = "my-existing-endpoint-123"
    model = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName=endpoint_name,
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    sm = model.sagemakerClient
    assert sm is not None

    resource_cleanup = SageMakerResourceCleanup(sm)
    assert resource_cleanup is not None

    created_resources = model.getCreatedResources()
    assert created_resources is not None

    resource_cleanup.deleteResources(created_resources)
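The same flow applies outside of a test: once a transform against a real endpoint has finished, the resources recorded on the model can be torn down. A minimal sketch, assuming `model` was produced by a fitted estimator and actually created an endpoint:

# Sketch: tear down SageMaker resources after a real transform.
# Assumes `model` came from estimator.fit() and created an endpoint.
cleanup = SageMakerResourceCleanup(model.sagemakerClient)
cleanup.deleteResources(model.getCreatedResources())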
Example #3
def test_sagemakerestimator_default_params():
    training_image = "train-abc-123"
    model_image = "model-abc-123"
    training_instance_count = 2
    training_instance_type = "train-abc-123"
    endpoint_instance_type = "endpoint-abc-123"
    endpoint_initial_instance_count = 2

    estimator = SageMakerEstimator(
        trainingImage=training_image,
        modelImage=model_image,
        trainingInstanceCount=training_instance_count,
        trainingInstanceType=training_instance_type,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer()
    )

    assert estimator.trainingInstanceVolumeSizeInGB == 1024
    assert estimator.trainingProjectedColumns is None
    assert estimator.trainingChannelName == "train"
    assert estimator.trainingContentType is None
    assert estimator.trainingS3DataDistribution == "ShardedByS3Key"
    assert estimator.trainingSparkDataFormat == "sagemaker"
    assert estimator.trainingInputMode == "File"
    assert estimator.trainingCompressionCodec is None
    assert estimator.trainingMaxRuntimeInSeconds == 24 * 60 * 60
    assert estimator.trainingKmsKeyId is None
    assert estimator.modelPrependInputRowsToTransformationRows is True
    assert estimator.deleteStagingDataAfterTraining is True
    assert estimator.latestTrainingJob is None
Example #4
def test_sagemakerestimator_passes_correct_params_to_scala():
    training_image = "train-abc-123"
    model_image = "model-abc-123"
    training_instance_count = 2
    training_instance_type = "train-abc-123"
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 2

    estimator = SageMakerEstimator(
        trainingImage=training_image,
        modelImage=model_image,
        trainingInstanceCount=training_instance_count,
        trainingInstanceType=training_instance_type,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer()
    )

    assert estimator.trainingImage == training_image
    assert estimator.modelImage == model_image
    assert estimator.trainingInstanceType == training_instance_type
    assert estimator.trainingInstanceCount == training_instance_count
    assert estimator.endpointInitialInstanceCount == endpoint_initial_instance_count
    assert estimator.endpointInstanceType == endpoint_instance_type
Example #5
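    # __init__ of KMeansSageMakerEstimator (see the super() call below),
    # showing the full set of constructor defaults.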
    def __init__(
            self,
            trainingInstanceType,
            trainingInstanceCount,
            endpointInstanceType,
            endpointInitialInstanceCount,
            sagemakerRole=IAMRoleFromConfig(),
            requestRowSerializer=ProtobufRequestRowSerializer(),
            responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
            trainingInputS3DataPath=S3AutoCreatePath(),
            trainingOutputS3DataPath=S3AutoCreatePath(),
            trainingInstanceVolumeSizeInGB=1024,
            trainingProjectedColumns=None,
            trainingChannelName="train",
            trainingContentType=None,
            trainingS3DataDistribution="ShardedByS3Key",
            trainingSparkDataFormat="sagemaker",
            trainingSparkDataFormatOptions=None,
            trainingInputMode="File",
            trainingCompressionCodec=None,
            trainingMaxRuntimeInSeconds=24 * 60 * 60,
            trainingKmsKeyId=None,
            modelEnvironmentVariables=None,
            endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_CONSTRUCT,
            sagemakerClient=SageMakerClients.create_sagemaker_client(),
            region=None,
            s3Client=SageMakerClients.create_s3_default_client(),
            stsClient=SageMakerClients.create_sts_default_client(),
            modelPrependInputRowsToTransformationRows=True,
            deleteStagingDataAfterTraining=True,
            namePolicyFactory=RandomNamePolicyFactory(),
            uid=None):

        if trainingSparkDataFormatOptions is None:
            trainingSparkDataFormatOptions = {}

        if modelEnvironmentVariables is None:
            modelEnvironmentVariables = {}

        if uid is None:
            uid = Identifiable._randomUID()

        kwargs = locals().copy()
        del kwargs['self']

        super(KMeansSageMakerEstimator, self).__init__(**kwargs)

        default_params = {'k': 2}

        self._setDefault(**default_params)
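Given those defaults, only the instance settings must be supplied to construct the estimator. A minimal usage sketch, assuming a placeholder role ARN and a hypothetical `training_data` DataFrame:

# Usage sketch for the constructor above; the role ARN and
# `training_data` DataFrame are hypothetical placeholders.
kmeans_estimator = KMeansSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    sagemakerRole=IAMRole("arn:aws:iam::123456789012:role/SageMakerRole"))
kmeans_estimator.setK(10)            # overrides the default k=2 set above
kmeans_estimator.setFeatureDim(784)  # e.g. 784 features for MNIST-like data
kmeans_model = kmeans_estimator.fit(training_data)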
Example #6
def test_sagemakermodel_can_be_created_from_java_obj():
    endpoint_name = "my-existing-endpoint-123"
    model = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName=endpoint_name,
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    new_model = SageMakerModel._from_java(model._to_java())
    assert new_model.uid == model.uid
Example #7
def test_kmeansSageMakerEstimator_passes_correct_params_to_scala():

    training_instance_type = "c4.8xlarge"
    training_instance_count = 3
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 3

    training_bucket = "random-bucket"
    input_prefix = "kmeans-training"
    output_prefix = "kmeans-out"
    integTestingRole = "arn:aws:iam::123456789:role/SageMakerRole"

    estimator = KMeansSageMakerEstimator(
        trainingInstanceType=training_instance_type,
        trainingInstanceCount=training_instance_count,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        sagemakerRole=IAMRole(integTestingRole),
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        trainingInstanceVolumeSizeInGB=2048,
        trainingInputS3DataPath=S3DataPath(training_bucket, input_prefix),
        trainingOutputS3DataPath=S3DataPath(training_bucket, output_prefix),
        trainingMaxRuntimeInSeconds=1,
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        s3Client=SageMakerClients.create_s3_default_client(),
        stsClient=SageMakerClients.create_sts_default_client(),
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker")

    assert estimator.trainingInputS3DataPath.bucket == training_bucket
    assert estimator.trainingInputS3DataPath.objectPath == input_prefix
    assert estimator.trainingInstanceCount == training_instance_count
    assert estimator.trainingInstanceType == training_instance_type
    assert estimator.endpointInstanceType == endpoint_instance_type
    assert estimator.endpointInitialInstanceCount == endpoint_initial_instance_count
    assert estimator.trainingInstanceVolumeSizeInGB == 2048
    assert estimator.trainingMaxRuntimeInSeconds == 1
    assert estimator.trainingKmsKeyId is None
Example #8
# Create a custom SageMakerEstimator
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker_pyspark import SageMakerEstimator
from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark import IAMRole
from sagemaker_pyspark import RandomNamePolicyFactory
from sagemaker_pyspark import EndpointCreationPolicy

# Create an Estimator from scratch
estimator = SageMakerEstimator(
    trainingImage=get_image_uri(region, 'kmeans'),  # Training image 
    modelImage=get_image_uri(region, 'kmeans'),  # Model image
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    hyperParameters={
        "k": "10",
        "feature_dim": "784"
    },  # Set parameters for K-Means
    sagemakerRole=IAMRole(role),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    trainingSparkDataFormat="sagemaker",
    namePolicyFactory=RandomNamePolicyFactory("sparksm-4-"),
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

customModel = estimator.fit(trainingData)
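The model returned by fit is a standard Spark Transformer. A short usage sketch, assuming `testData` is a DataFrame with a "features" vector column matching the serializer configured above:

# Usage sketch: `testData` is an assumed DataFrame with a "features"
# vector column.
transformedData = customModel.transform(testData)
transformedData.show()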
Example #9
print(ENDPOINT_NAME)
from sagemaker_pyspark import SageMakerModel
from sagemaker_pyspark import EndpointCreationPolicy
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer

attachedModel = SageMakerModel(
    existingEndpointName=ENDPOINT_NAME,
    endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
    endpointInstanceType=None,  # Required by the constructor; unused with an existing endpoint
    endpointInitialInstanceCount=None,  # Required by the constructor; unused with an existing endpoint
    requestRowSerializer=ProtobufRequestRowSerializer(
        featuresColumnName="features"),  # Optional: already the default value
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(  # Optional: already the default values
        distance_to_cluster_column_name="distance_to_cluster",
        closest_cluster_column_name="closest_cluster"))

transformedData2 = attachedModel.transform(testData)
transformedData2.show()

# Create model and endpoint from model data
from sagemaker_pyspark import S3DataPath

MODEL_S3_PATH = S3DataPath(initialModel.modelPath.bucket,
                           initialModel.modelPath.objectPath)
MODEL_ROLE_ARN = initialModel.modelExecutionRoleARN
MODEL_IMAGE_PATH = initialModel.modelImage

print(MODEL_S3_PATH.bucket + "/" + MODEL_S3_PATH.objectPath)
print(MODEL_ROLE_ARN)
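With those values extracted, a new SageMakerModel can be built directly from the model data, mirroring the constructor arguments used in the earlier examples. A minimal sketch; the endpoint instance settings are placeholder choices:

# Sketch: recreate a model from its S3 model data. An endpoint is
# created on the first transform due to CREATE_ON_TRANSFORM.
retrievedModel = SageMakerModel(
    modelPath=MODEL_S3_PATH,
    modelImage=MODEL_IMAGE_PATH,
    modelExecutionRoleARN=MODEL_ROLE_ARN,
    endpointInstanceType="ml.t2.medium",  # placeholder instance settings
    endpointInitialInstanceCount=1,
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

transformedData3 = retrievedModel.transform(testData)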