def test_condition_greater_than_or_equal_to(): var = ExecutionVariables.START_DATETIME param = ParameterString(name="StartDateTime") cond = ConditionGreaterThanOrEqualTo(left=var, right=param) assert cond.to_request() == { "Type": "GreaterThanOrEqualTo", "LeftValue": { "Get": "Execution.StartDateTime" }, "RightValue": { "Get": "Parameters.StartDateTime" }, }
def test_model_registration_with_model_repack( sagemaker_session, role, pipeline_name, region_name, ): base_dir = os.path.join(DATA_DIR, "pytorch_mnist") entry_point = os.path.join(base_dir, "mnist.py") input_path = sagemaker_session.upload_data( path=os.path.join(base_dir, "training"), key_prefix="integ-test-data/pytorch_mnist/training", ) inputs = TrainingInput(s3_data=input_path) instance_count = ParameterInteger(name="InstanceCount", default_value=1) instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1) pytorch_estimator = PyTorch( entry_point=entry_point, role=role, framework_version="1.5.0", py_version="py3", instance_count=instance_count, instance_type=instance_type, sagemaker_session=sagemaker_session, ) step_train = TrainingStep( name="pytorch-train", estimator=pytorch_estimator, inputs=inputs, ) step_register = RegisterModel( name="pytorch-register-model", estimator=pytorch_estimator, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["*"], response_types=["*"], inference_instances=["*"], transform_instances=["*"], description="test-description", entry_point=entry_point, ) model = Model( image_uri=pytorch_estimator.training_image_uri(), model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sagemaker_session, role=role, ) model_inputs = CreateModelInput( instance_type="ml.m5.large", accelerator_type="ml.eia1.medium", ) step_model = CreateModelStep( name="pytorch-model", model=model, inputs=model_inputs, ) step_cond = ConditionStep( name="cond-good-enough", conditions=[ ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1) ], if_steps=[step_train, step_register], else_steps=[step_model], ) pipeline = Pipeline( name=pipeline_name, parameters=[good_enough_input, instance_count, instance_type], steps=[step_cond], sagemaker_session=sagemaker_session, ) try: response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn) execution = pipeline.start(parameters={}) assert re.match( fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) execution = pipeline.start(parameters={"GoodEnoughInput": 0}) assert re.match( fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) finally: try: pipeline.delete() except Exception: pass
def get_pipeline( region, role=None, default_bucket=None, model_package_group_name="CustomerChurnPackageGroup", # Choose any name pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) base_job_prefix="CustomerChurn", # Choose any name ): """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Parameters for pipeline execution processing_instance_count = ParameterInteger( name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge") training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge") model_approval_status = ParameterString( name="ModelApprovalStatus", default_value= "PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. ) input_data = ParameterString( name="InputDataUrl", default_value= f"s3://sm-pipelines-demo-data-123456789/churn.txt", # Change this to point to the s3 location of your raw input data. ) # Processing step for feature engineering sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name= f"{base_job_prefix}/sklearn-CustomerChurn-preprocess", # choose any name sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="CustomerChurnProcess", # choose any name processor=sklearn_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=["--input-data", input_data], ) # Training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain" image_uri = sagemaker.image_uris.retrieve( framework= "xgboost", # we are using the Sagemaker built in xgboost algorithm region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/CustomerChurn-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters( objective="binary:logistic", num_round=50, max_depth=5, eta=0.2, gamma=4, min_child_weight=6, subsample=0.7, silent=0, ) step_train = TrainingStep( name="CustomerChurnTrain", estimator=xgb_train, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig. Outputs["train"].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig. Outputs["validation"].S3Output.S3Uri, content_type="text/csv", ), }, ) # Processing step for evaluation script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval", sagemaker_session=sagemaker_session, role=role, ) evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json", ) step_eval = ProcessingStep( name="CustomerChurnEval", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig. Outputs["test"].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluate.py"), property_files=[evaluation_report], ) # Register model step that will be conditionally executed model_metrics = ModelMetrics(model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0] ["S3Output"]["S3Uri"]), content_type="application/json", )) # Register model step that will be conditionally executed step_register = RegisterModel( name="CustomerChurnRegisterModel", estimator=xgb_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here left=JsonGet( step=step_eval, property_file=evaluation_report, json_path= "binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), right=0.8, # You can change the threshold here ) step_cond = ConditionStep( name="CustomerChurnAccuracyCond", conditions=[cond_lte], if_steps=[step_register], else_steps=[], ) # Pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, ], steps=[step_process, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) return pipeline
destination="/opt/ml/processing/test") ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ProcessingOutput(output_name="mlmodel", source="/opt/ml/processing/mlmodel"), ProcessingOutput(output_name="eval_images", source="/opt/ml/processing/eval_images") ], code=f"{conf.source_dir}/evaluation.py", property_files=[evaluation_report]) cond_map = ConditionGreaterThanOrEqualTo( left=JsonGet(step=step_eval, property_file=evaluation_report, json_path="regression_metrics.mAP.value"), right=conf.model_approval_map_threshold) model_metrics = ModelMetrics( model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"] ["S3Uri"]), content_type="application/json")) step_register = RegisterModel( name="BittiRegisterModel", estimator=tf_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["application/octet-stream"], response_types=["application/octet-stream"],
def get_pipeline(region, role, default_bucket, pipeline_name, model_package_group_name, base_job_prefix): """Gets a SageMaker ML Pipeline instance working with BERT. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts pipeline_name: name of this pipeline model_package_group_name: model package group base_job_prefix: prefic of the job name Returns: an instance of a pipeline """ sm = boto3.Session().client(service_name="sagemaker", region_name=region) input_data = ParameterString( name="InputDataUrl", default_value="s3://{}/amazon-reviews-pds/tsv/".format(bucket), ) processing_instance_count = ParameterInteger( name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.c5.2xlarge") max_seq_length = ParameterInteger( name="MaxSeqLength", default_value=64, ) balance_dataset = ParameterString( name="BalanceDataset", default_value="True", ) train_split_percentage = ParameterFloat( name="TrainSplitPercentage", default_value=0.90, ) validation_split_percentage = ParameterFloat( name="ValidationSplitPercentage", default_value=0.05, ) test_split_percentage = ParameterFloat( name="TestSplitPercentage", default_value=0.05, ) feature_store_offline_prefix = ParameterString( name="FeatureStoreOfflinePrefix", default_value="reviews-feature-store-" + str(timestamp), ) feature_group_name = ParameterString( name="FeatureGroupName", default_value="reviews-feature-group-" + str(timestamp)) train_instance_type = ParameterString(name="TrainInstanceType", default_value="ml.c5.9xlarge") train_instance_count = ParameterInteger(name="TrainInstanceCount", default_value=1) ######################### # PROCESSING STEP ######################### processor = SKLearnProcessor( framework_version="0.23-1", role=role, instance_type=processing_instance_type, instance_count=processing_instance_count, env={"AWS_DEFAULT_REGION": region}, max_runtime_in_seconds=7200, ) processing_inputs = [ ProcessingInput( input_name="raw-input-data", source=input_data, destination="/opt/ml/processing/input/data/", s3_data_distribution_type="ShardedByS3Key", ) ] processing_outputs = [ ProcessingOutput( output_name="bert-train", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/bert/train", ), ProcessingOutput( output_name="bert-validation", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/bert/validation", ), ProcessingOutput( output_name="bert-test", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/bert/test", ), ] # TODO: Figure out why the Parameter's are not resolving properly to their native type when user here. # We shouldn't be using `default_value` processing_step = ProcessingStep( name="Processing", processor=processor, inputs=processing_inputs, outputs=processing_outputs, job_arguments=[ "--train-split-percentage", str(train_split_percentage.default_value), "--validation-split-percentage", str(validation_split_percentage.default_value), "--test-split-percentage", str(test_split_percentage.default_value), "--max-seq-length", str(max_seq_length.default_value), "--balance-dataset", str(balance_dataset.default_value), "--feature-store-offline-prefix", str(feature_store_offline_prefix.default_value), "--feature-group-name", str(feature_group_name.default_value), ], code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py"), ) ######################### # TRAINING STEP ######################### epochs = ParameterInteger(name="Epochs", default_value=1) learning_rate = ParameterFloat(name="LearningRate", default_value=0.00001) epsilon = ParameterFloat(name="Epsilon", default_value=0.00000001) train_batch_size = ParameterInteger(name="TrainBatchSize", default_value=128) validation_batch_size = ParameterInteger(name="ValidationBatchSize", default_value=128) test_batch_size = ParameterInteger(name="TestBatchSize", default_value=128) train_steps_per_epoch = ParameterInteger(name="TrainStepsPerEpoch", default_value=50) validation_steps = ParameterInteger(name="ValidationSteps", default_value=50) test_steps = ParameterInteger(name="TestSteps", default_value=50) train_volume_size = ParameterInteger(name="TrainVolumeSize", default_value=1024) use_xla = ParameterString( name="UseXLA", default_value="True", ) use_amp = ParameterString( name="UseAMP", default_value="True", ) freeze_bert_layer = ParameterString( name="FreezeBERTLayer", default_value="False", ) enable_sagemaker_debugger = ParameterString( name="EnableSageMakerDebugger", default_value="False", ) enable_checkpointing = ParameterString( name="EnableCheckpointing", default_value="False", ) enable_tensorboard = ParameterString( name="EnableTensorboard", default_value="False", ) input_mode = ParameterString( name="InputMode", default_value="File", ) run_validation = ParameterString( name="RunValidation", default_value="True", ) run_test = ParameterString( name="RunTest", default_value="False", ) run_sample_predictions = ParameterString( name="RunSamplePredictions", default_value="False", ) metrics_definitions = [ { "Name": "train:loss", "Regex": "loss: ([0-9\\.]+)" }, { "Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)" }, { "Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)" }, { "Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)" }, ] train_src = os.path.join(BASE_DIR, "src") model_path = f"s3://{default_bucket}/{base_job_prefix}/output/model" estimator = TensorFlow( entry_point="tf_bert_reviews.py", source_dir=BASE_DIR, role=role, output_path=model_path, instance_count=train_instance_count, instance_type=train_instance_type, volume_size=train_volume_size, py_version="py37", framework_version="2.3.1", hyperparameters={ "epochs": epochs, "learning_rate": learning_rate, "epsilon": epsilon, "train_batch_size": train_batch_size, "validation_batch_size": validation_batch_size, "test_batch_size": test_batch_size, "train_steps_per_epoch": train_steps_per_epoch, "validation_steps": validation_steps, "test_steps": test_steps, "use_xla": use_xla, "use_amp": use_amp, "max_seq_length": max_seq_length, "freeze_bert_layer": freeze_bert_layer, "enable_sagemaker_debugger": enable_sagemaker_debugger, "enable_checkpointing": enable_checkpointing, "enable_tensorboard": enable_tensorboard, "run_validation": run_validation, "run_test": run_test, "run_sample_predictions": run_sample_predictions, }, input_mode=input_mode, metric_definitions=metrics_definitions, # max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute ) training_step = TrainingStep( name="Train", estimator=estimator, inputs={ "train": TrainingInput( s3_data=processing_step.properties.ProcessingOutputConfig. Outputs["bert-train"].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=processing_step.properties.ProcessingOutputConfig. Outputs["bert-validation"].S3Output.S3Uri, content_type="text/csv", ), "test": TrainingInput( s3_data=processing_step.properties.ProcessingOutputConfig. Outputs["bert-test"].S3Output.S3Uri, content_type="text/csv", ), }, ) ######################### # EVALUATION STEP ######################### evaluation_processor = SKLearnProcessor( framework_version="0.23-1", role=role, instance_type=processing_instance_type, instance_count=processing_instance_count, env={"AWS_DEFAULT_REGION": region}, max_runtime_in_seconds=7200, ) evaluation_report = PropertyFile(name="EvaluationReport", output_name="metrics", path="evaluation.json") evaluation_step = ProcessingStep( name="EvaluateModel", processor=evaluation_processor, code=os.path.join(BASE_DIR, "evaluate_model_metrics.py"), inputs=[ ProcessingInput( source=training_step.properties.ModelArtifacts. S3ModelArtifacts, destination="/opt/ml/processing/input/model", ), ProcessingInput( source=processing_step.properties. ProcessingInputs["raw-input-data"].S3Input.S3Uri, destination="/opt/ml/processing/input/data", ), ], outputs=[ ProcessingOutput(output_name="metrics", s3_upload_mode="EndOfJob", source="/opt/ml/processing/output/metrics/"), ], job_arguments=[ "--max-seq-length", str(max_seq_length.default_value), ], property_files=[evaluation_report ], # these cause deserialization issues ) model_metrics = ModelMetrics(model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0] ["S3Output"]["S3Uri"]), content_type="application/json", )) ######################### ## REGISTER TRAINED MODEL STEP ######################### model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="PendingManualApproval") deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.4xlarge") deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1) inference_image_uri = sagemaker.image_uris.retrieve( framework="tensorflow", region=region, version="2.3.1", py_version="py37", instance_type=deploy_instance_type, image_scope="inference", ) print(inference_image_uri) register_step = RegisterModel( name="RegisterModel", estimator=estimator, image_uri= inference_image_uri, # we have to specify, by default it's using training image model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=[ deploy_instance_type ], # The JSON spec must be within these instance types or we will see "Instance Type Not Allowed" Exception transform_instances=[deploy_instance_type], model_package_group_name=model_package_group_name, approval_status=model_approval_status, ) ######################### ## CREATE MODEL FOR DEPLOYMENT STEP ######################### model = Model( image_uri=inference_image_uri, model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sess, role=role, ) create_inputs = CreateModelInput(instance_type=deploy_instance_type, ) create_step = CreateModelStep( name="CreateModel", model=model, inputs=create_inputs, ) ######################### ## CONDITION STEP: EVALUATE THE MODEL ######################### min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.01) minimum_accuracy_condition = ConditionGreaterThanOrEqualTo( left=JsonGet( step=evaluation_step, property_file=evaluation_report, json_path="metrics.accuracy.value", ), right=min_accuracy_value, # accuracy ) minimum_accuracy_condition_step = ConditionStep( name="AccuracyCondition", conditions=[minimum_accuracy_condition], if_steps=[register_step, create_step], # success, continue with model registration else_steps=[], # fail, end the pipeline ) ######################### ## CREATE PIPELINE ######################### pipeline = Pipeline( name=pipeline_name, parameters=[ input_data, processing_instance_count, processing_instance_type, max_seq_length, balance_dataset, train_split_percentage, validation_split_percentage, test_split_percentage, feature_store_offline_prefix, feature_group_name, train_instance_type, train_instance_count, epochs, learning_rate, epsilon, train_batch_size, validation_batch_size, test_batch_size, train_steps_per_epoch, validation_steps, test_steps, train_volume_size, use_xla, use_amp, freeze_bert_layer, enable_sagemaker_debugger, enable_checkpointing, enable_tensorboard, input_mode, run_validation, run_test, run_sample_predictions, min_accuracy_value, model_approval_status, deploy_instance_type, deploy_instance_count, ], steps=[ processing_step, training_step, evaluation_step, minimum_accuracy_condition_step ], sagemaker_session=sess, ) ######################### ## RETURN PIPELINE ######################### return pipeline
def get_pipeline( region, sagemaker_project_arn=None, role=None, default_bucket=None, model_package_group_name="restatePackageGroup", # Choose any name pipeline_name="restate-p-XXXXXXXXX", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) base_job_prefix="restate", # Choose any name ): """Gets a SageMaker ML Pipeline instance working with on RE data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Parameters for pipeline execution processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.2xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. ) input_data = ParameterString( name="InputDataUrl", default_value=f"", # Change this to point to the s3 location of your raw input data. ) data_sources = [] # Sagemaker session sess = sagemaker_session # You can configure this with your own bucket name, e.g. # bucket = "my-bucket" bucket = sess.default_bucket() data_sources.append( ProcessingInput( input_name="restate-california", dataset_definition=DatasetDefinition( local_path="/opt/ml/processing/restate-california", data_distribution_type="FullyReplicated", # You can override below to point to other database or use different queries athena_dataset_definition=AthenaDatasetDefinition( catalog="AwsDataCatalog", database="restate", query_string="SELECT * FROM restate.california_10", output_s3_uri=f"s3://{bucket}/athena/", output_format="PARQUET", ), ), ) ) print(f"Data Wrangler export storage bucket: {bucket}") # unique flow export ID flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}" flow_export_name = f"flow-{flow_export_id}" # Output name is auto-generated from the select node's ID + output name from the flow file. output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default" s3_output_prefix = f"export-{flow_export_name}/output" s3_output_path = f"s3://{bucket}/{s3_output_prefix}" print(f"Flow S3 export result path: {s3_output_path}") processing_job_output = ProcessingOutput( output_name=output_name, source="/opt/ml/processing/output", destination=s3_output_path, s3_upload_mode="EndOfJob", ) # name of the flow file which should exist in the current notebook working directory flow_file_name = "sagemaker-pipeline/restate-athena-california.flow" # Load .flow file from current notebook working directory #!echo "Loading flow file from current notebook working directory: $PWD" with open(flow_file_name) as f: flow = json.load(f) # Upload flow to S3 s3_client = boto3.client("s3") s3_client.upload_file( flow_file_name, bucket, f"data_wrangler_flows/{flow_export_name}.flow", ExtraArgs={"ServerSideEncryption": "aws:kms"}, ) flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow" print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}") ## Input - Flow: restate-athena-russia.flow flow_input = ProcessingInput( source=flow_s3_uri, destination="/opt/ml/processing/flow", input_name="flow", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", ) # IAM role for executing the processing job. iam_role = role # Unique processing job name. Give a unique name every time you re-execute processing jobs processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}" # Data Wrangler Container URL. container_uri = sagemaker.image_uris.retrieve( framework="data-wrangler", # we are using the Sagemaker built in xgboost algorithm region=region, ) # Processing Job Instance count and instance type. instance_count = 2 instance_type = "ml.m5.4xlarge" # Size in GB of the EBS volume to use for storing data during processing volume_size_in_gb = 30 # Content type for each output. Data Wrangler supports CSV as default and Parquet. output_content_type = "CSV" # Network Isolation mode; default is off enable_network_isolation = False # List of tags to be passed to the processing job user_tags = [] # Output configuration used as processing job container arguments output_config = {output_name: {"content_type": output_content_type}} # KMS key for per object encryption; default is None kms_key = None processor = Processor( role=iam_role, image_uri=container_uri, instance_count=instance_count, instance_type=instance_type, volume_size_in_gb=volume_size_in_gb, network_config=NetworkConfig(enable_network_isolation=enable_network_isolation), sagemaker_session=sess, output_kms_key=kms_key, tags=user_tags, ) data_wrangler_step = ProcessingStep( name="DataWranglerProcess", processor=processor, inputs=[flow_input] + data_sources, outputs=[processing_job_output], job_arguments=[f"--output-config '{json.dumps(output_config)}'"], ) # Processing step for feature engineering # this processor does not have awswrangler installed sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess", # choose any name sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="Preprocess", # choose any name processor=sklearn_processor, inputs=[ ProcessingInput( source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, destination="/opt/ml/processing/data/raw-data-dir", ) ], outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=[ "--input-data", data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, ], ) # Training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" cache_config = CacheConfig(enable_caching=True, expire_after="30d") xgb_image_uri = sagemaker.image_uris.retrieve( framework="xgboost", # we are using the Sagemaker built in xgboost algorithm region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=xgb_image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/restate-xgb-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters( # #objective="binary:logistic", # objective="reg:linear", num_round=50, # max_depth=5, # eta=0.2, # gamma=4, # min_child_weight=6, # subsample=0.7, # silent=0, ) xgb_train.set_hyperparameters(grow_policy="lossguide") xgb_objective_metric_name = "validation:mse" xgb_hyperparameter_ranges = { "max_depth": IntegerParameter(2, 10, scaling_type="Linear"), } xgb_tuner_log = HyperparameterTuner( xgb_train, xgb_objective_metric_name, xgb_hyperparameter_ranges, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) xgb_step_tuning = TuningStep( name="XGBHPTune", tuner=xgb_tuner_log, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest' dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version( ImageName="restate-dtree" )["ContainerImage"] dtree_train = Estimator( image_uri=dtree_image_uri, role=role, instance_count=1, instance_type=training_instance_type, base_job_name=f"{base_job_prefix}/restate-dtree-train", output_path=model_path, sagemaker_session=sagemaker_session, ) dtree_objective_metric_name = "validation:mse" dtree_metric_definitions = [{"Name": "validation:mse", "Regex": "mse:(\S+)"}] dtree_hyperparameter_ranges = { "max_depth": IntegerParameter(10, 50, scaling_type="Linear"), "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"), } dtree_tuner_log = HyperparameterTuner( dtree_train, dtree_objective_metric_name, dtree_hyperparameter_ranges, dtree_metric_definitions, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) dtree_step_tuning = TuningStep( name="DTreeHPTune", tuner=dtree_tuner_log, inputs={ "training": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) dtree_script_eval = ScriptProcessor( image_uri=dtree_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-dtree-eval", sagemaker_session=sagemaker_session, role=role, ) dtree_evaluation_report = PropertyFile( name="EvaluationReportDTree", output_name="dtree_evaluation", path="dtree_evaluation.json", ) dtree_step_eval = ProcessingStep( name="DTreeEval", processor=dtree_script_eval, inputs=[ ProcessingInput( # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts, source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput( output_name="dtree_evaluation", source="/opt/ml/processing/evaluation" ), ], code=os.path.join(BASE_DIR, "dtree_evaluate.py"), property_files=[dtree_evaluation_report], ) xgb_script_eval = ScriptProcessor( image_uri=xgb_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-xgb-eval", sagemaker_session=sagemaker_session, role=role, ) xgb_evaluation_report = PropertyFile( name="EvaluationReportXGBoost", output_name="xgb_evaluation", path="xgb_evaluation.json", ) xgb_step_eval = ProcessingStep( name="XGBEval", processor=xgb_script_eval, inputs=[ ProcessingInput( source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "xgb_evaluate.py"), property_files=[xgb_evaluation_report], ) xgb_model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/xgb_evaluation.json".format( xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json", ) ) dtree_model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/dtree_evaluation.json".format( dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][ "S3Uri" ] ), content_type="application/json", ) ) xgb_eval_metrics = JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) dtree_eval_metrics = JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) # Register model step that will be conditionally executed dtree_step_register = RegisterModel( name="DTreeReg", estimator=dtree_train, model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=dtree_model_metrics, ) # Register model step that will be conditionally executed xgb_step_register = RegisterModel( name="XGBReg", estimator=xgb_train, model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=xgb_model_metrics, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here left=JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), right=JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), # You can change the threshold here ) step_cond = ConditionStep( name="AccuracyCond", conditions=[cond_lte], if_steps=[dtree_step_register], else_steps=[xgb_step_register], ) create_date = time.strftime("%Y-%m-%d-%H-%M-%S") # Pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data ], pipeline_experiment_config=PipelineExperimentConfig( pipeline_name + "-" + create_date, "restate-{}".format(create_date) ), steps=[ data_wrangler_step, step_process, dtree_step_tuning, xgb_step_tuning, dtree_step_eval, xgb_step_eval, step_cond, ], sagemaker_session=sagemaker_session, ) return pipeline
def get_pipeline( region, role=None, default_bucket=None, model_package_group_name="AbalonePackageGroup", pipeline_name="AbalonePipeline", base_job_prefix="Abalone", ): """Gets a SageMaker ML Pipeline instance working with on abalone data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Create cache configuration cache_config = CacheConfig(enable_caching=True, expire_after="T30m") # Create SKlean processor object sklearn_processor = SKLearnProcessor( framework_version="0.20.0", role=role, instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name="credit-processing-job" ) # Use the sklearn_processor in a Sagemaker pipelines ProcessingStep step_preprocess_data = ProcessingStep( name="PreprocessCreditData", processor=sklearn_processor, cache_config=cache_config, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ], outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test"), ProcessingOutput(output_name="baseline_with_headers", source="/opt/ml/processing/output/baseline") ], code=os.path.join(BASE_DIR, "preprocessing.py"), ) # Where to store the trained model model_path = f"s3://{default_bucket}/CreditTrain" # Fetch container to use for training image_uri = sagemaker.image_uris.retrieve( framework="xgboost", region=region, version="1.2-2", py_version="py3", instance_type=training_instance_type, ) # Create XGBoost estimator object xgb_estimator = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, role=role, disable_profiler=True, ) # Specify hyperparameters xgb_estimator.set_hyperparameters(max_depth=5, eta=0.2, gamma=4, min_child_weight=6, subsample=0.8, objective='binary:logistic', num_round=25) # Use the xgb_estimator in a Sagemaker pipelines ProcessingStep. # NOTE how the input to the training job directly references the output of the previous step. step_train_model = TrainingStep( name="TrainCreditModel", estimator=xgb_estimator, cache_config=cache_config, inputs={ "train": TrainingInput( s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv" ), "validation": TrainingInput( s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv" ) }, ) # Create ScriptProcessor object. evaluate_model_processor = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name="script-credit-eval", role=role, ) # Create a PropertyFile # We use a PropertyFile to be able to reference outputs from a processing step, for instance to use in a condition step, which we'll see later on. # For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json" ) # Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep. step_evaluate_model = ProcessingStep( name="EvaluateCreditModel", processor=evaluate_model_processor, cache_config=cache_config, inputs=[ ProcessingInput( source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model" ), ProcessingInput( source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test" ) ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluation.py"), property_files=[evaluation_report], ) model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json" ) ) # Crete a RegisterModel step, which registers your model with Sagemaker Model Registry. step_register_model = RegisterModel( name="RegisterCreditModel", estimator=xgb_estimator, model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"], transform_instances=["ml.m5.xlarge"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics ) # Create Processor object using the model monitor image baseline_processor = sagemaker.processing.Processor( base_job_name="credit-risk-baseline-processor", image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region='eu-west-1'), role=role, instance_count=1, instance_type=processing_instance_type, env = { "dataset_format": "{\"csv\": {\"header\": true} }", "dataset_source": "/opt/ml/processing/sm_input", "output_path": "/opt/ml/processing/sm_output", "publish_cloudwatch_metrics": "Disabled" } ) # Create a Sagemaker Pipeline step, using the baseline_processor. step_create_data_baseline = ProcessingStep( name="CreateModelQualityBaseline", processor=baseline_processor, cache_config=cache_config, inputs=[ ProcessingInput( source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[ "baseline_with_headers" ].S3Output.S3Uri, destination="/opt/ml/processing/sm_input", ) ], outputs=[ ProcessingOutput( source="/opt/ml/processing/sm_output", destination="s3://{}/{}/baseline".format(default_bucket, base_job_prefix), output_name="baseline_result", ) ], ) # Create Condition cond_gte = ConditionGreaterThanOrEqualTo( left=JsonGet( step=step_evaluate_model, property_file=evaluation_report, json_path="binary_classification_metrics.accuracy.value" ), right=0.7 ) # Create a Sagemaker Pipelines ConditionStep, using the condition we just created. step_cond = ConditionStep( name="AccuracyCondition", conditions=[cond_gte], if_steps=[step_register_model], else_steps=[], ) from sagemaker.workflow.pipeline import Pipeline # Create a Sagemaker Pipeline pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, ], steps=[step_preprocess_data, step_train_model, step_evaluate_model, step_create_data_baseline, step_cond], ) return pipeline
def test_steps_with_map_params_pipeline( sagemaker_session, role, script_dir, pipeline_name, region_name, athena_dataset_definition, ): instance_count = ParameterInteger(name="InstanceCount", default_value=2) framework_version = "0.20.0" instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") output_prefix = ParameterString(name="OutputPrefix", default_value="output") input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" sklearn_processor = SKLearnProcessor( framework_version=framework_version, instance_type=instance_type, instance_count=instance_count, base_job_name="test-sklearn", sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="my-process", display_name="ProcessingStep", description="description for Processing step", processor=sklearn_processor, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ProcessingInput(dataset_definition=athena_dataset_definition), ], outputs=[ ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), ProcessingOutput( output_name="test_data", source="/opt/ml/processing/test", destination=Join( on="/", values=[ "s3:/", sagemaker_session.default_bucket(), "test-sklearn", output_prefix, ExecutionVariables.PIPELINE_EXECUTION_ID, ], ), ), ], code=os.path.join(script_dir, "preprocessing.py"), ) sklearn_train = SKLearn( framework_version=framework_version, entry_point=os.path.join(script_dir, "train.py"), instance_type=instance_type, sagemaker_session=sagemaker_session, role=role, hyperparameters={ "batch-size": 500, "epochs": 5, }, ) step_train = TrainingStep( name="my-train", display_name="TrainingStep", description="description for Training step", estimator=sklearn_train, inputs=TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train_data" ].S3Output.S3Uri ), ) model = Model( image_uri=sklearn_train.image_uri, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sagemaker_session, role=role, ) model_inputs = CreateModelInput( instance_type="ml.m5.large", accelerator_type="ml.eia1.medium", ) step_model = CreateModelStep( name="my-model", display_name="ModelStep", description="description for Model step", model=model, inputs=model_inputs, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( left=step_train.properties.HyperParameters["batch-size"], right=6.0, ) step_cond = ConditionStep( name="CustomerChurnAccuracyCond", conditions=[cond_lte], if_steps=[], else_steps=[step_model], ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_type, instance_count, output_prefix], steps=[step_process, step_train, step_cond], sagemaker_session=sagemaker_session, ) definition = json.loads(pipeline.definition()) assert definition["Version"] == "2020-12-01" steps = definition["Steps"] assert len(steps) == 3 training_args = {} condition_args = {} for step in steps: if step["Type"] == "Training": training_args = step["Arguments"] if step["Type"] == "Condition": condition_args = step["Arguments"] assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" } assert condition_args["Conditions"][0]["LeftValue"] == { "Get": "Steps.my-train.HyperParameters['batch-size']" } try: response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) finally: try: pipeline.delete() except Exception: pass