from sagemaker_pyspark import IAMRole, EndpointCreationPolicy
from sagemaker_pyspark.algorithms import PCASageMakerEstimator


def get_pca_estimator():
    # Instance configuration for the training job and the hosted endpoint.
    training_instance_type = "c4.8xlarge"
    training_instance_count = 3
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 3
    estimator = PCASageMakerEstimator(
        trainingInstanceType=training_instance_type,
        trainingInstanceCount=training_instance_count,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        sagemakerRole=IAMRole("some-role"),
        # Defer endpoint creation until transform() is first called.
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)
    return estimator
from sagemaker_pyspark import RandomNamePolicyFactory
from sagemaker_pyspark.algorithms import LinearLearnerBinaryClassifier


def get_linear_learner_binary_classifier():
    training_instance_type = "c4.8xlarge"
    training_instance_count = 3
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 3
    estimator = LinearLearnerBinaryClassifier(
        trainingInstanceType=training_instance_type,
        trainingInstanceCount=training_instance_count,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        sagemakerRole=IAMRole("some-role"),
        trainingProjectedColumns=None,
        trainingS3DataDistribution="by-key",
        trainingInputMode="File",
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        # Keep the input rows alongside the model's output rows.
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker")
    return estimator
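# Usage sketch (an assumption, not from the original source): both factory
# helpers return an unfitted estimator. Given a DataFrame with a Vector
# "features" column, fit() launches the SageMaker training job, and
# transform() creates the endpoint on first use (per CREATE_ON_TRANSFORM).
def run_pca_example(df):
    # `df` is a hypothetical DataFrame with a Vector "features" column.
    estimator = get_pca_estimator()
    estimator.setFeatureDim(784)     # required PCA hyperparameters
    estimator.setNumComponents(10)
    pca_model = estimator.fit(df)        # trains on SageMaker
    projected = pca_model.transform(df)  # creates the endpoint, scores rows
    projected.show()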
from sagemaker_pyspark import SageMakerEstimator
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark.transformation.deserializers import (
    LinearLearnerRegressorProtobufResponseRowDeserializer)


def train_tffm(roleArn, image_uri, header_file_bucket, header_file_prefix,
               train_instance_type, endpoint_instance_type, train_df):
    # Bring-your-own-algorithm estimator: the same container image serves
    # both training and hosting.
    estimator = SageMakerEstimator(
        trainingImage=image_uri,
        modelImage=image_uri,
        trainingInstanceType=train_instance_type,
        trainingInstanceCount=1,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=1,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=LinearLearnerRegressorProtobufResponseRowDeserializer(),
        hyperParameters={
            "order": "3",
            "rank": "7",
            "epochs": "50",
            "header_file_bucket": header_file_bucket,
            "header_file_prefix": header_file_prefix,
        },
        trainingInstanceVolumeSizeInGB=1024,
        trainingSparkDataFormat="sagemaker",
        sagemakerRole=IAMRole(roleArn))

    model = estimator.fit(train_df)
    return model
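# Usage sketch (all argument values below are hypothetical assumptions):
#
#     tffm_model = train_tffm(
#         roleArn="arn:aws:iam::123456789012:role/SageMakerRole",
#         image_uri="123456789012.dkr.ecr.us-east-1.amazonaws.com/tffm:latest",
#         header_file_bucket="my-bucket",
#         header_file_prefix="tffm/headers",
#         train_instance_type="ml.m4.xlarge",
#         endpoint_instance_type="ml.m4.xlarge",
#         train_df=train_df)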
from sagemaker_pyspark import S3DataPath, SageMakerClients
from sagemaker_pyspark.transformation.deserializers import (
    LinearLearnerBinaryClassifierProtobufResponseRowDeserializer)


def test_linearLearnerBinaryClassifier_passes_correct_params_to_scala():

    training_instance_type = "c4.8xlarge"
    training_instance_count = 3
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 3

    training_bucket = "random-bucket"
    input_prefix = "linear-learner-binary-classifier-training"
    output_prefix = "linear-learner-binary-classifier-out"
    integTestingRole = "arn:aws:iam::123456789:role/SageMakerRole"

    estimator = LinearLearnerBinaryClassifier(
        trainingInstanceType=training_instance_type,
        trainingInstanceCount=training_instance_count,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        sagemakerRole=IAMRole(integTestingRole),
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=LinearLearnerBinaryClassifierProtobufResponseRowDeserializer(),
        trainingInstanceVolumeSizeInGB=2048,
        trainingInputS3DataPath=S3DataPath(training_bucket, input_prefix),
        trainingOutputS3DataPath=S3DataPath(training_bucket, output_prefix),
        trainingMaxRuntimeInSeconds=1,
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        s3Client=SageMakerClients.create_s3_default_client(),
        stsClient=SageMakerClients.create_sts_default_client(),
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker")

    assert estimator.trainingInputS3DataPath.bucket == training_bucket
    assert estimator.trainingInputS3DataPath.objectPath == input_prefix
    assert estimator.trainingInstanceCount == training_instance_count
    assert estimator.trainingInstanceType == training_instance_type
    assert estimator.endpointInstanceType == endpoint_instance_type
    assert estimator.endpointInitialInstanceCount == endpoint_initial_instance_count
    assert estimator.trainingInstanceVolumeSizeInGB == 2048
    assert estimator.trainingMaxRuntimeInSeconds == 1
    assert estimator.trainingKmsKeyId is None
# pyspark --packages com.amazonaws:sagemaker-spark_2.11:spark_2.1.1-1.0

# Train, deploy and invoke a standalone SageMaker model from Spark: MNIST (784) --> KMeans (10)

from sagemaker_pyspark import IAMRole
from sagemaker_pyspark.algorithms import KMeansSageMakerEstimator

iam_role = "arn:aws:iam::ACCOUNT_NUMBER:role/ROLE_NAME"
region = "us-east-1"
training_data = spark.read.format("libsvm").option("numFeatures", "784").load(
    "s3a://sagemaker-sample-data-{}/spark/mnist/train/".format(region))

test_data = spark.read.format("libsvm").option("numFeatures", "784").load(
    "s3a://sagemaker-sample-data-{}/spark/mnist/test/".format(region))

kmeans_estimator = KMeansSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.m4.xlarge",
    endpointInitialInstanceCount=1,
    sagemakerRole=IAMRole(iam_role))

kmeans_estimator.setK(10)
kmeans_estimator.setFeatureDim(784)

kmeans_model = kmeans_estimator.fit(training_data)

transformed_data = kmeans_model.transform(test_data)
transformed_data.show()
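
# Optional cleanup sketch (an addition, not part of the original example):
# delete the SageMaker model, endpoint config, and endpoint created above.
# SageMakerResourceCleanup is the sagemaker_pyspark helper for this.
from sagemaker_pyspark import SageMakerResourceCleanup

resource_cleanup = SageMakerResourceCleanup(kmeans_model.sagemakerClient)
resource_cleanup.deleteResources(kmeans_model.getCreatedResources())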
# Assumed context (not shown in the original snippet): `spark`, `region`,
# `role`, `trainingData` and `testData` are defined earlier in the notebook.
from sagemaker.amazon.amazon_estimator import get_image_uri  # SageMaker SDK v1
from sagemaker_pyspark import (EndpointCreationPolicy, IAMRole,
                               RandomNamePolicyFactory, SageMakerEstimator)
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark.transformation.deserializers import (
    KMeansProtobufResponseRowDeserializer)

# Create an Estimator from scratch
estimator = SageMakerEstimator(
    trainingImage=get_image_uri(region, 'kmeans'),  # Training image 
    modelImage=get_image_uri(region, 'kmeans'),  # Model image
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    hyperParameters={
        "k": "10",
        "feature_dim": "784"
    },  # Set parameters for K-Means
    sagemakerRole=IAMRole(role),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    trainingSparkDataFormat="sagemaker",
    namePolicyFactory=RandomNamePolicyFactory("sparksm-4-"),
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

customModel = estimator.fit(trainingData)

# Inference
transformedData = customModel.transform(testData)
transformedData.show()

# pyspark --packages com.amazonaws:sagemaker-spark_2.11:spark_2.1.1-1.0

# Train, deploy and invoke a standalone SageMaker model from Spark: XGBoost

from sagemaker_pyspark import IAMRole
from sagemaker_pyspark.algorithms import XGBoostSageMakerEstimator

iam_role = "arn:aws:iam::ACCOUNT_NUMBER:role/ROLE_NAME"
region = "us-east-1"

training_data = spark.read.format("libsvm").option("numFeatures", "784").load(
    "s3a://sagemaker-sample-data-{}/spark/mnist/train/".format(region))
test_data = spark.read.format("libsvm").option("numFeatures", "784").load(
    "s3a://sagemaker-sample-data-{}/spark/mnist/test/".format(region))

xgboost_estimator = XGBoostSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge", trainingInstanceCount=1,
    endpointInstanceType="ml.m4.xlarge", endpointInitialInstanceCount=1,
    sagemakerRole=IAMRole(iam_role))

xgboost_estimator.setObjective('multi:softmax')
xgboost_estimator.setNumRound(25)
xgboost_estimator.setNumClasses(10)

xgboost_model = xgboost_estimator.fit(training_data)

transformed_data = xgboost_model.transform(test_data)
transformed_data.show()
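
# Evaluation sketch (an addition, not part of the original example): compute
# accuracy of the endpoint's predictions. Column names "label" (from the
# libsvm loader) and "prediction" (from the response deserializer) are
# assumptions; the cast guards against a non-double prediction type.
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

scored = transformed_data.withColumn(
    "prediction", transformed_data["prediction"].cast(DoubleType()))
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("accuracy: {}".format(evaluator.evaluate(scored)))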