def get_pca_estimator():
    """Build a PCASageMakerEstimator on c4.8xlarge instances (3 for training,
    3 for the endpoint), deferring endpoint creation until transform time.

    Returns:
        PCASageMakerEstimator: the configured, unfitted estimator.
    """
    # Training and endpoint use the same instance type and count.
    instance_type = "c4.8xlarge"
    instance_count = 3
    return PCASageMakerEstimator(
        trainingInstanceType=instance_type,
        trainingInstanceCount=instance_count,
        endpointInstanceType=instance_type,
        endpointInitialInstanceCount=instance_count,
        sagemakerRole=IAMRole("some-role"),
        # Endpoint is only spun up on the first transform() call.
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
    )
def get_linear_learner_binary_classifier():
    """Build a LinearLearnerBinaryClassifier on c4.8xlarge instances (3 for
    training, 3 for the endpoint) with key-based S3 data distribution, File
    input mode, and endpoint creation deferred to transform time.

    Returns:
        LinearLearnerBinaryClassifier: the configured, unfitted estimator.
    """
    # Training and endpoint use the same instance type and count.
    instance_type = "c4.8xlarge"
    instance_count = 3
    return LinearLearnerBinaryClassifier(
        trainingInstanceType=instance_type,
        trainingInstanceCount=instance_count,
        endpointInstanceType=instance_type,
        endpointInitialInstanceCount=instance_count,
        sagemakerRole=IAMRole("some-role"),
        trainingProjectedColumns=None,
        trainingS3DataDistribution="by-key",
        trainingInputMode="File",
        # Endpoint is only spun up on the first transform() call.
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        # Transformed output keeps the original input columns.
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker",
    )
def train_tffm(roleArn, image_uri, header_file_bucket, header_file_prefix,
               train_instance_type, endpoint_instance_type, train_df):
    """Train a TFFM model with a custom SageMaker container and return the
    fitted model.

    Args:
        roleArn: IAM role ARN SageMaker assumes for the training job.
        image_uri: ECR URI used for both the training and model images.
        header_file_bucket: S3 bucket holding the header file.
        header_file_prefix: S3 key prefix of the header file.
        train_instance_type: EC2 instance type for the training job.
        endpoint_instance_type: EC2 instance type for the endpoint.
        train_df: Spark DataFrame of training records.

    Returns:
        SageMakerModel: the model produced by fitting on ``train_df``.
    """
    # Hyperparameter values must be strings for the SageMaker API.
    hyper_parameters = {
        "order": "3",
        "rank": "7",
        "epochs": "50",
        "header_file_bucket": header_file_bucket,
        "header_file_prefix": header_file_prefix,
    }
    estimator = SageMakerEstimator(
        trainingImage=image_uri,
        modelImage=image_uri,
        trainingInstanceType=train_instance_type,
        trainingInstanceCount=1,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=1,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=LinearLearnerRegressorProtobufResponseRowDeserializer(),
        hyperParameters=hyper_parameters,
        trainingInstanceVolumeSizeInGB=1024,
        trainingSparkDataFormat='sagemaker',
        sagemakerRole=IAMRole(roleArn),
    )
    return estimator.fit(train_df)
def test_linearLearnerBinaryClassifier_passes_correct_params_to_scala():
    """Verify constructor arguments survive the round trip into the
    Scala-backed LinearLearnerBinaryClassifier and read back unchanged.
    """
    # Training and endpoint use the same instance type and count.
    instance_type = "c4.8xlarge"
    instance_count = 3
    training_bucket = "random-bucket"
    input_prefix = "linear-learner-binary-classifier-training"
    output_prefix = "linear-learner-binary-classifier-out"
    integTestingRole = "arn:aws:iam::123456789:role/SageMakerRole"
    volume_gb = 2048
    max_runtime_s = 1

    estimator = LinearLearnerBinaryClassifier(
        trainingInstanceType=instance_type,
        trainingInstanceCount=instance_count,
        endpointInstanceType=instance_type,
        endpointInitialInstanceCount=instance_count,
        sagemakerRole=IAMRole(integTestingRole),
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=LinearLearnerBinaryClassifierProtobufResponseRowDeserializer(),
        trainingInstanceVolumeSizeInGB=volume_gb,
        trainingInputS3DataPath=S3DataPath(training_bucket, input_prefix),
        trainingOutputS3DataPath=S3DataPath(training_bucket, output_prefix),
        trainingMaxRuntimeInSeconds=max_runtime_s,
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        s3Client=SageMakerClients.create_s3_default_client(),
        stsClient=SageMakerClients.create_sts_default_client(),
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker",
    )

    # Every value passed in above must read back exactly as supplied.
    assert estimator.trainingInputS3DataPath.bucket == training_bucket
    assert estimator.trainingInputS3DataPath.objectPath == input_prefix
    assert estimator.trainingInstanceCount == instance_count
    assert estimator.trainingInstanceType == instance_type
    assert estimator.endpointInstanceType == instance_type
    assert estimator.endpointInitialInstanceCount == instance_count
    assert estimator.trainingInstanceVolumeSizeInGB == volume_gb
    assert estimator.trainingMaxRuntimeInSeconds == max_runtime_s
    # Not supplied, so it must default to None.
    assert estimator.trainingKmsKeyId is None
# pyspark --packages com.amazonaws:sagemaker-spark_2.11:spark_2.1.1-1.0
# Train, deploy and invoke a standalone SageMaker model from Spark: MNIST (784) --> KMeans (10)
from sagemaker_pyspark import IAMRole
from sagemaker_pyspark.algorithms import KMeansSageMakerEstimator

iam_role = "arn:aws:iam::ACCOUNT_NUMBER:role/ROLE_NAME"
region = "us-east-1"

# MNIST in libsvm format (784 pixel features per row) from the public sample bucket.
# NOTE(review): test_data is read from the *train/* prefix, same as training_data —
# confirm this is intentional for the demo rather than a copy-paste of the path.
mnist_train_uri = "s3a://sagemaker-sample-data-{}/spark/mnist/train/".format(region)
training_data = spark.read.format("libsvm").option("numFeatures", "784").load(mnist_train_uri)
test_data = spark.read.format("libsvm").option("numFeatures", "784").load(mnist_train_uri)

# One ml.m4.xlarge for training and one for serving.
kmeans_estimator = KMeansSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.m4.xlarge",
    endpointInitialInstanceCount=1,
    sagemakerRole=IAMRole(iam_role),
)
kmeans_estimator.setK(10)            # 10 clusters (one per digit)
kmeans_estimator.setFeatureDim(784)  # 28x28 pixel images

# fit() trains on SageMaker and deploys an endpoint; transform() invokes it.
kmeans_model = kmeans_estimator.fit(training_data)
transformed_data = kmeans_model.transform(test_data)
transformed_data.show()
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark import IAMRole
from sagemaker_pyspark import RandomNamePolicyFactory
from sagemaker_pyspark import EndpointCreationPolicy

# Create an Estimator from scratch, wiring the generic SageMakerEstimator to
# the built-in K-Means container.
# NOTE(review): `region`, `role`, `get_image_uri`, `trainingData`, `testData`
# and KMeansProtobufResponseRowDeserializer are defined elsewhere in the
# surrounding notebook/script.
kmeans_image = get_image_uri(region, 'kmeans')
estimator = SageMakerEstimator(
    trainingImage=kmeans_image,  # Training image
    modelImage=kmeans_image,     # Model image
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    hyperParameters={"k": "10", "feature_dim": "784"},  # Set parameters for K-Means
    sagemakerRole=IAMRole(role),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    trainingSparkDataFormat="sagemaker",
    # All SageMaker resources get a "sparksm-4-" name prefix.
    namePolicyFactory=RandomNamePolicyFactory("sparksm-4-"),
    # Endpoint is only created on the first transform() call.
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
)

customModel = estimator.fit(trainingData)

# Inference
transformedData = customModel.transform(testData)
transformedData.show()

from pyspark.sql.types import DoubleType
# pyspark --packages com.amazonaws:sagemaker-spark_2.11:spark_2.1.1-1.0
# Train, deploy and invoke a standalone SageMaker model from Spark: XGBoost
from sagemaker_pyspark import IAMRole
from sagemaker_pyspark.algorithms import XGBoostSageMakerEstimator

iam_role = "arn:aws:iam::ACCOUNT_NUMBER:role/ROLE_NAME"
region = "us-east-1"

# MNIST in libsvm format (784 pixel features per row) from the public sample bucket.
# NOTE(review): test_data is read from the *train/* prefix, same as training_data —
# confirm this is intentional for the demo rather than a copy-paste of the path.
mnist_train_uri = "s3a://sagemaker-sample-data-{}/spark/mnist/train/".format(region)
training_data = spark.read.format("libsvm").option("numFeatures", "784").load(mnist_train_uri)
test_data = spark.read.format("libsvm").option("numFeatures", "784").load(mnist_train_uri)

# One ml.m4.xlarge for training and one for serving.
xgboost_estimator = XGBoostSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.m4.xlarge",
    endpointInitialInstanceCount=1,
    sagemakerRole=IAMRole(iam_role),
)
xgboost_estimator.setObjective('multi:softmax')  # multiclass classification
xgboost_estimator.setNumRound(25)                # boosting rounds
xgboost_estimator.setNumClasses(10)              # one class per digit

# fit() trains on SageMaker and deploys an endpoint; transform() invokes it.
xgboost_model = xgboost_estimator.fit(training_data)
transformed_data = xgboost_model.transform(test_data)
transformed_data.show()