def __init__(self, name: str, model: Model, inputs: CreateModelInput, depends_on: List[str] = None):
    """Construct a `CreateModelStep`, given an `sagemaker.model.Model` instance.

    In addition to the `Model` instance, the other arguments are those that are
    supplied to the `_create_sagemaker_model` method of `sagemaker.model.Model`.

    Args:
        name (str): The name of the `CreateModelStep`.
        model (Model): A `sagemaker.model.Model` instance.
        inputs (CreateModelInput): A `sagemaker.inputs.CreateModelInput` instance.
        depends_on (List[str]): A list of step names this
            `sagemaker.workflow.steps.CreateModelStep` depends on.
    """
    super(CreateModelStep, self).__init__(name, StepTypeEnum.CREATE_MODEL, depends_on)
    self.model = model
    self.inputs = inputs or CreateModelInput()

    self._properties = Properties(path=f"Steps.{name}", shape_name="DescribeModelOutput")
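# Usage sketch for the constructor above, mirroring the unit tests in this
# section. The image URI, model-data path, and role are placeholder assumptions,
# not values from this codebase.
from sagemaker.inputs import CreateModelInput
from sagemaker.model import Model
from sagemaker.workflow.steps import CreateModelStep

model = Model(
    image_uri="<ecr-image-uri>",  # placeholder
    model_data="s3://my-bucket/model.tar.gz",  # placeholder
    role="<execution-role-arn>",  # placeholder
)
step = CreateModelStep(
    name="MyCreateModelStep",
    model=model,
    inputs=CreateModelInput(instance_type="ml.m5.large"),
)
# Downstream steps can reference the created model through the step's properties:
assert step.properties.ModelName.expr == {"Get": "Steps.MyCreateModelStep.ModelName"}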
def test_create_model_step_with_model_pipeline(tfo, time, sagemaker_session):
    framework_model = DummyFrameworkModel(sagemaker_session)
    sparkml_model = SparkMLModel(
        model_data="s3://bucket/model_2.tar.gz",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        env={"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"},
    )
    model = PipelineModel(
        models=[framework_model, sparkml_model], role=ROLE, sagemaker_session=sagemaker_session
    )
    inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    step = CreateModelStep(
        name="MyCreateModelStep",
        depends_on=["TestStep"],
        display_name="MyCreateModelStep",
        description="TestDescription",
        model=model,
        inputs=inputs,
    )
    step.add_depends_on(["SecondTestStep"])
    assert step.to_request() == {
        "Name": "MyCreateModelStep",
        "Type": "Model",
        "Description": "TestDescription",
        "DisplayName": "MyCreateModelStep",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            "Containers": [
                {
                    "Environment": {
                        "SAGEMAKER_PROGRAM": "dummy_script.py",
                        "SAGEMAKER_SUBMIT_DIRECTORY": "s3://my-bucket/mi-1-2017-10-10-14-14-15/sourcedir.tar.gz",
                        "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
                        "SAGEMAKER_REGION": "us-west-2",
                    },
                    "Image": "mi-1",
                    "ModelDataUrl": "s3://bucket/model_1.tar.gz",
                },
                {
                    "Environment": {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"},
                    "Image": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sparkml-serving:2.4",
                    "ModelDataUrl": "s3://bucket/model_2.tar.gz",
                },
            ],
            "ExecutionRoleArn": "DummyRole",
        },
    }
    assert step.properties.ModelName.expr == {"Get": "Steps.MyCreateModelStep.ModelName"}
def test_create_model_step(sagemaker_session):
    model = Model(
        image_uri=IMAGE_URI,
        role=ROLE,
        sagemaker_session=sagemaker_session,
    )
    inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    step = CreateModelStep(
        name="MyCreateModelStep",
        depends_on=["TestStep"],
        model=model,
        inputs=inputs,
    )
    step.add_depends_on(["SecondTestStep"])
    assert step.to_request() == {
        "Name": "MyCreateModelStep",
        "Type": "Model",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            "ExecutionRoleArn": "DummyRole",
            "PrimaryContainer": {"Environment": {}, "Image": "fakeimage"},
        },
    }
    assert step.properties.ModelName.expr == {"Get": "Steps.MyCreateModelStep.ModelName"}
def __init__(
    self,
    name: str,
    model: Union[Model, PipelineModel],
    inputs: CreateModelInput = None,
    depends_on: Union[List[str], List[Step]] = None,
    retry_policies: List[RetryPolicy] = None,
    display_name: str = None,
    description: str = None,
):
    """Construct a `CreateModelStep`, given an `sagemaker.model.Model` instance.

    In addition to the `Model` instance, the other arguments are those that are
    supplied to the `_create_sagemaker_model` method of `sagemaker.model.Model`.

    Args:
        name (str): The name of the `CreateModelStep`.
        model (Model or PipelineModel): A `sagemaker.model.Model`
            or `sagemaker.pipeline.PipelineModel` instance.
        inputs (CreateModelInput): A `sagemaker.inputs.CreateModelInput` instance.
            Defaults to `None`.
        depends_on (List[str] or List[Step]): A list of `Step` names or `Step` instances
            this `sagemaker.workflow.steps.CreateModelStep` depends on.
        retry_policies (List[RetryPolicy]): A list of retry policies.
        display_name (str): The display name of the `CreateModelStep`.
        description (str): The description of the `CreateModelStep`.
    """
    super(CreateModelStep, self).__init__(
        name, StepTypeEnum.CREATE_MODEL, display_name, description, depends_on, retry_policies
    )
    self.model = model
    self.inputs = inputs or CreateModelInput()

    self._properties = Properties(path=f"Steps.{name}", shape_name="DescribeModelOutput")
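# Sketch of the extended signature above with a retry policy, reusing the
# StepRetryPolicy pattern from the retry-policy tests later in this section;
# `model` is assumed to be built as in the earlier sketch.
from sagemaker.workflow.retry import StepExceptionTypeEnum, StepRetryPolicy

service_fault_retry_policy = StepRetryPolicy(
    exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
)
step = CreateModelStep(
    name="MyCreateModelStep",
    model=model,
    inputs=CreateModelInput(instance_type="ml.m5.large"),
    retry_policies=[service_fault_retry_policy],  # retry CreateModel on service faults
    display_name="MyCreateModelStep",
    description="Creates the SageMaker model used downstream",
)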
def test_estimator_transformer(estimator):
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 2

    for request_dict in request_dicts:
        if request_dict["Type"] == "Model":
            assert request_dict == {
                "Name": "EstimatorTransformerStepCreateModelStep",
                "Type": "Model",
                "Arguments": {
                    "ExecutionRoleArn": "DummyRole",
                    "PrimaryContainer": {
                        "Environment": {},
                        "Image": "fakeimage",
                        "ModelDataUrl": "s3://my-bucket/model.tar.gz",
                    },
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
            }
        else:
            raise Exception("A step of an invalid type exists in the collection.")
def __init__(
    self,
    name: str,
    step_args: Optional[dict] = None,
    model: Optional[Union[Model, PipelineModel]] = None,
    inputs: Optional[CreateModelInput] = None,
    depends_on: Optional[List[Union[str, Step, "StepCollection"]]] = None,
    retry_policies: Optional[List[RetryPolicy]] = None,
    display_name: Optional[str] = None,
    description: Optional[str] = None,
):
    """Construct a `CreateModelStep`, given an `sagemaker.model.Model` instance.

    In addition to the `Model` instance, the other arguments are those that are
    supplied to the `_create_sagemaker_model` method of `sagemaker.model.Model`.

    Args:
        name (str): The name of the `CreateModelStep`.
        step_args (dict): The arguments for the `CreateModelStep` definition (default: None).
        model (Model or PipelineModel): A `sagemaker.model.Model`
            or `sagemaker.pipeline.PipelineModel` instance (default: None).
        inputs (CreateModelInput): A `sagemaker.inputs.CreateModelInput` instance
            (default: None).
        depends_on (List[Union[str, Step, StepCollection]]): A list of `Step`/`StepCollection`
            names or `Step` instances or `StepCollection` instances that this `CreateModelStep`
            depends on (default: None).
        retry_policies (List[RetryPolicy]): A list of retry policies (default: None).
        display_name (str): The display name of the `CreateModelStep` (default: None).
        description (str): The description of the `CreateModelStep` (default: None).
    """
    super(CreateModelStep, self).__init__(
        name, StepTypeEnum.CREATE_MODEL, display_name, description, depends_on, retry_policies
    )

    if not (step_args is None) ^ (model is None):
        raise ValueError(
            "step_args and model are mutually exclusive. Exactly one of them must be provided."
        )

    self.step_args = step_args
    self.model = model
    self.inputs = inputs or CreateModelInput()

    self._properties = Properties(path=f"Steps.{name}", shape_name="DescribeModelOutput")

    # TODO: add public document link here once ready
    warnings.warn(
        (
            "We are deprecating the use of CreateModelStep. "
            "Instead, please use the ModelStep, which simply takes in the step arguments "
            "generated by model.create()."
        ),
        DeprecationWarning,
    )
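# The deprecation warning above points to ModelStep. A migration sketch, assuming
# a PipelineSession-backed Model so that model.create() returns step arguments
# instead of calling the CreateModel API directly; URIs and role are placeholders.
from sagemaker.model import Model
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline_context import PipelineSession

pipeline_session = PipelineSession()
model = Model(
    image_uri="<ecr-image-uri>",  # placeholder
    model_data="s3://my-bucket/model.tar.gz",  # placeholder
    role="<execution-role-arn>",  # placeholder
    sagemaker_session=pipeline_session,  # defers the CreateModel call to the pipeline
)
step_model = ModelStep(
    name="MyModelStep",
    step_args=model.create(instance_type="ml.m5.large"),
)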
def test_model_registration_with_model_repack(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
):
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
    )
    step_register = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["*"],
        response_types=["*"],
        inference_instances=["*"],
        transform_instances=["*"],
        description="test-description",
        entry_point=entry_point,
    )

    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="pytorch-model",
        model=model,
        inputs=model_inputs,
    )

    step_cond = ConditionStep(
        name="cond-good-enough",
        conditions=[ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)],
        if_steps=[step_train, step_register],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[good_enough_input, instance_count, instance_type],
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_three_step_definition(
    sagemaker_session,
    region_name,
    role,
    script_dir,
    pipeline_name,
    athena_dataset_definition,
):
    framework_version = "0.20.0"
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    output_prefix = ParameterString(name="OutputPrefix", default_value="output")
    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="my-process",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
            ProcessingOutput(
                output_name="test_data",
                source="/opt/ml/processing/test",
                destination=Join(
                    on="/",
                    values=[
                        "s3:/",
                        sagemaker_session.default_bucket(),
                        "test-sklearn",
                        output_prefix,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                    ],
                ),
            ),
        ],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    sklearn_train = SKLearn(
        framework_version=framework_version,
        entry_point=os.path.join(script_dir, "train.py"),
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_train = TrainingStep(
        name="my-train",
        estimator=sklearn_train,
        inputs=TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "train_data"
            ].S3Output.S3Uri
        ),
    )

    model = Model(
        image_uri=sklearn_train.image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="my-model",
        model=model,
        inputs=model_inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_type, instance_count, output_prefix],
        steps=[step_process, step_train, step_model],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    assert definition["Version"] == "2020-12-01"

    assert set(tuple(param.items()) for param in definition["Parameters"]) == set(
        [
            tuple(
                {"Name": "InstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}.items()
            ),
            tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()),
            tuple({"Name": "OutputPrefix", "Type": "String", "DefaultValue": "output"}.items()),
        ]
    )

    steps = definition["Steps"]
    assert len(steps) == 3

    names_and_types = []
    processing_args = {}
    training_args = {}
    model_args = {}  # initialized alongside the others to avoid a NameError if no Model step is found
    for step in steps:
        names_and_types.append((step["Name"], step["Type"]))
        if step["Type"] == "Processing":
            processing_args = step["Arguments"]
        if step["Type"] == "Training":
            training_args = step["Arguments"]
        if step["Type"] == "Model":
            model_args = step["Arguments"]

    assert set(names_and_types) == set(
        [
            ("my-process", "Processing"),
            ("my-train", "Training"),
            ("my-model", "Model"),
        ]
    )

    assert processing_args["ProcessingResources"]["ClusterConfig"] == {
        "InstanceType": {"Get": "Parameters.InstanceType"},
        "InstanceCount": {"Get": "Parameters.InstanceCount"},
        "VolumeSizeInGB": 30,
    }

    assert training_args["ResourceConfig"] == {
        "InstanceCount": 1,
        "InstanceType": {"Get": "Parameters.InstanceType"},
        "VolumeSizeInGB": 30,
    }
    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == {
        "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
    }
    assert model_args["PrimaryContainer"]["ModelDataUrl"] == {
        "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts"
    }

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def get_pipeline(region, role, default_bucket, pipeline_name, model_package_group_name, base_job_prefix):
    """Gets a SageMaker ML Pipeline instance working with BERT.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
        pipeline_name: name of this pipeline
        model_package_group_name: model package group
        base_job_prefix: prefix of the job name

    Returns:
        an instance of a pipeline
    """
    sm = boto3.Session().client(service_name="sagemaker", region_name=region)

    # `sess` and `timestamp` are assumed to be module-level globals in the original
    # source; they are defined here so the function is self-contained.
    sess = sagemaker.Session()
    timestamp = int(time.time())

    input_data = ParameterString(
        name="InputDataUrl",
        default_value="s3://{}/amazon-reviews-pds/tsv/".format(default_bucket),
    )

    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.c5.2xlarge"
    )
    max_seq_length = ParameterInteger(
        name="MaxSeqLength",
        default_value=64,
    )
    balance_dataset = ParameterString(
        name="BalanceDataset",
        default_value="True",
    )
    train_split_percentage = ParameterFloat(
        name="TrainSplitPercentage",
        default_value=0.90,
    )
    validation_split_percentage = ParameterFloat(
        name="ValidationSplitPercentage",
        default_value=0.05,
    )
    test_split_percentage = ParameterFloat(
        name="TestSplitPercentage",
        default_value=0.05,
    )
    feature_store_offline_prefix = ParameterString(
        name="FeatureStoreOfflinePrefix",
        default_value="reviews-feature-store-" + str(timestamp),
    )
    feature_group_name = ParameterString(
        name="FeatureGroupName",
        default_value="reviews-feature-group-" + str(timestamp),
    )
    train_instance_type = ParameterString(name="TrainInstanceType", default_value="ml.c5.9xlarge")
    train_instance_count = ParameterInteger(name="TrainInstanceCount", default_value=1)

    #########################
    # PROCESSING STEP
    #########################

    processor = SKLearnProcessor(
        framework_version="0.23-1",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        env={"AWS_DEFAULT_REGION": region},
        max_runtime_in_seconds=7200,
    )

    processing_inputs = [
        ProcessingInput(
            input_name="raw-input-data",
            source=input_data,
            destination="/opt/ml/processing/input/data/",
            s3_data_distribution_type="ShardedByS3Key",
        )
    ]

    processing_outputs = [
        ProcessingOutput(
            output_name="bert-train",
            s3_upload_mode="EndOfJob",
            source="/opt/ml/processing/output/bert/train",
        ),
        ProcessingOutput(
            output_name="bert-validation",
            s3_upload_mode="EndOfJob",
            source="/opt/ml/processing/output/bert/validation",
        ),
        ProcessingOutput(
            output_name="bert-test",
            s3_upload_mode="EndOfJob",
            source="/opt/ml/processing/output/bert/test",
        ),
    ]

    # TODO: Figure out why the Parameters are not resolving properly to their
    # native type when used here. We shouldn't be using `default_value`.
    processing_step = ProcessingStep(
        name="Processing",
        processor=processor,
        inputs=processing_inputs,
        outputs=processing_outputs,
        job_arguments=[
            "--train-split-percentage",
            str(train_split_percentage.default_value),
            "--validation-split-percentage",
            str(validation_split_percentage.default_value),
            "--test-split-percentage",
            str(test_split_percentage.default_value),
            "--max-seq-length",
            str(max_seq_length.default_value),
            "--balance-dataset",
            str(balance_dataset.default_value),
            "--feature-store-offline-prefix",
            str(feature_store_offline_prefix.default_value),
            "--feature-group-name",
            str(feature_group_name.default_value),
        ],
        code=os.path.join(BASE_DIR, "preprocess-scikit-text-to-bert-feature-store.py"),
    )

    #########################
    # TRAINING STEP
    #########################

    epochs = ParameterInteger(name="Epochs", default_value=1)
    learning_rate = ParameterFloat(name="LearningRate", default_value=0.00001)
    epsilon = ParameterFloat(name="Epsilon", default_value=0.00000001)
    train_batch_size = ParameterInteger(name="TrainBatchSize", default_value=128)
    validation_batch_size = ParameterInteger(name="ValidationBatchSize", default_value=128)
    test_batch_size = ParameterInteger(name="TestBatchSize", default_value=128)
    train_steps_per_epoch = ParameterInteger(name="TrainStepsPerEpoch", default_value=50)
    validation_steps = ParameterInteger(name="ValidationSteps", default_value=50)
    test_steps = ParameterInteger(name="TestSteps", default_value=50)
    train_volume_size = ParameterInteger(name="TrainVolumeSize", default_value=1024)
    use_xla = ParameterString(
        name="UseXLA",
        default_value="True",
    )
    use_amp = ParameterString(
        name="UseAMP",
        default_value="True",
    )
    freeze_bert_layer = ParameterString(
        name="FreezeBERTLayer",
        default_value="False",
    )
    enable_sagemaker_debugger = ParameterString(
        name="EnableSageMakerDebugger",
        default_value="False",
    )
    enable_checkpointing = ParameterString(
        name="EnableCheckpointing",
        default_value="False",
    )
    enable_tensorboard = ParameterString(
        name="EnableTensorboard",
        default_value="False",
    )
    input_mode = ParameterString(
        name="InputMode",
        default_value="File",
    )
    run_validation = ParameterString(
        name="RunValidation",
        default_value="True",
    )
    run_test = ParameterString(
        name="RunTest",
        default_value="False",
    )
    run_sample_predictions = ParameterString(
        name="RunSamplePredictions",
        default_value="False",
    )

    metrics_definitions = [
        {"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"},
        {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"},
        {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"},
        {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"},
    ]

    train_src = os.path.join(BASE_DIR, "src")
    model_path = f"s3://{default_bucket}/{base_job_prefix}/output/model"

    estimator = TensorFlow(
        entry_point="tf_bert_reviews.py",
        source_dir=BASE_DIR,
        role=role,
        output_path=model_path,
        instance_count=train_instance_count,
        instance_type=train_instance_type,
        volume_size=train_volume_size,
        py_version="py37",
        framework_version="2.3.1",
        hyperparameters={
            "epochs": epochs,
            "learning_rate": learning_rate,
            "epsilon": epsilon,
            "train_batch_size": train_batch_size,
            "validation_batch_size": validation_batch_size,
            "test_batch_size": test_batch_size,
            "train_steps_per_epoch": train_steps_per_epoch,
            "validation_steps": validation_steps,
            "test_steps": test_steps,
            "use_xla": use_xla,
            "use_amp": use_amp,
            "max_seq_length": max_seq_length,
            "freeze_bert_layer": freeze_bert_layer,
            "enable_sagemaker_debugger": enable_sagemaker_debugger,
            "enable_checkpointing": enable_checkpointing,
            "enable_tensorboard": enable_tensorboard,
            "run_validation": run_validation,
            "run_test": run_test,
            "run_sample_predictions": run_sample_predictions,
        },
        input_mode=input_mode,
        metric_definitions=metrics_definitions,
        # max_run=7200,  # max 2 hours: 60 minutes per hour * 60 seconds per minute
    )

    training_step = TrainingStep(
        name="Train",
        estimator=estimator,
        inputs={
            "train": TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    "bert-train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    "bert-validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "test": TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    "bert-test"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    #########################
    # EVALUATION STEP
    #########################

    evaluation_processor = SKLearnProcessor(
        framework_version="0.23-1",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        env={"AWS_DEFAULT_REGION": region},
        max_runtime_in_seconds=7200,
    )

    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="metrics", path="evaluation.json"
    )

    evaluation_step = ProcessingStep(
        name="EvaluateModel",
        processor=evaluation_processor,
        code=os.path.join(BASE_DIR, "evaluate_model_metrics.py"),
        inputs=[
            ProcessingInput(
                source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/input/model",
            ),
            ProcessingInput(
                source=processing_step.properties.ProcessingInputs["raw-input-data"].S3Input.S3Uri,
                destination="/opt/ml/processing/input/data",
            ),
        ],
        outputs=[
            ProcessingOutput(
                output_name="metrics",
                s3_upload_mode="EndOfJob",
                source="/opt/ml/processing/output/metrics/",
            ),
        ],
        job_arguments=[
            "--max-seq-length",
            str(max_seq_length.default_value),
        ],
        property_files=[evaluation_report],  # these cause deserialization issues
    )

    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][
                    "S3Uri"
                ]
            ),
            content_type="application/json",
        )
    )

    #########################
    ## REGISTER TRAINED MODEL STEP
    #########################

    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.4xlarge")
    deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1)

    inference_image_uri = sagemaker.image_uris.retrieve(
        framework="tensorflow",
        region=region,
        version="2.3.1",
        py_version="py37",
        instance_type=deploy_instance_type,
        image_scope="inference",
    )
    print(inference_image_uri)

    register_step = RegisterModel(
        name="RegisterModel",
        estimator=estimator,
        image_uri=inference_image_uri,  # we have to specify; by default it uses the training image
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        # The JSON spec must be within these instance types or we will see an
        # "Instance Type Not Allowed" exception
        inference_instances=[deploy_instance_type],
        transform_instances=[deploy_instance_type],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
    )

    #########################
    ## CREATE MODEL FOR DEPLOYMENT STEP
    #########################

    model = Model(
        image_uri=inference_image_uri,
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sess,
        role=role,
    )

    create_inputs = CreateModelInput(
        instance_type=deploy_instance_type,
    )

    create_step = CreateModelStep(
        name="CreateModel",
        model=model,
        inputs=create_inputs,
    )

    #########################
    ## CONDITION STEP: EVALUATE THE MODEL
    #########################

    min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.01)

    minimum_accuracy_condition = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=evaluation_step,
            property_file=evaluation_report,
            json_path="metrics.accuracy.value",
        ),
        right=min_accuracy_value,  # accuracy
    )

    minimum_accuracy_condition_step = ConditionStep(
        name="AccuracyCondition",
        conditions=[minimum_accuracy_condition],
        if_steps=[register_step, create_step],  # success: continue with model registration
        else_steps=[],  # fail: end the pipeline
    )

    #########################
    ## CREATE PIPELINE
    #########################

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            input_data,
            processing_instance_count,
            processing_instance_type,
            max_seq_length,
            balance_dataset,
            train_split_percentage,
            validation_split_percentage,
            test_split_percentage,
            feature_store_offline_prefix,
            feature_group_name,
            train_instance_type,
            train_instance_count,
            epochs,
            learning_rate,
            epsilon,
            train_batch_size,
            validation_batch_size,
            test_batch_size,
            train_steps_per_epoch,
            validation_steps,
            test_steps,
            train_volume_size,
            use_xla,
            use_amp,
            freeze_bert_layer,
            enable_sagemaker_debugger,
            enable_checkpointing,
            enable_tensorboard,
            input_mode,
            run_validation,
            run_test,
            run_sample_predictions,
            min_accuracy_value,
            model_approval_status,
            deploy_instance_type,
            deploy_instance_count,
        ],
        steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],
        sagemaker_session=sess,
    )

    #########################
    ## RETURN PIPELINE
    #########################

    return pipeline
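# Usage sketch for `get_pipeline`, following the create/start pattern used by the
# tests in this section; the region, role ARN, bucket, and names are placeholders.
pipeline = get_pipeline(
    region="us-west-2",
    role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",
    default_bucket="my-sagemaker-bucket",
    pipeline_name="bert-reviews-pipeline",
    model_package_group_name="bert-reviews-model-group",
    base_job_prefix="bert-reviews",
)
response = pipeline.create(role_arn="arn:aws:iam::111122223333:role/SageMakerExecutionRole")
execution = pipeline.start()
execution.wait()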
def test_estimator_transformer(estimator):
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 2

    for request_dict in request_dicts:
        if request_dict["Type"] == "Model":
            assert request_dict == {
                "Name": "EstimatorTransformerStepCreateModelStep",
                "Type": "Model",
                "DependsOn": ["TestStep"],
                "RetryPolicies": [service_fault_retry_policy.to_request()],
                "Arguments": {
                    "ExecutionRoleArn": "DummyRole",
                    "PrimaryContainer": {
                        "Environment": {},
                        "Image": "fakeimage",
                        "ModelDataUrl": "s3://my-bucket/model.tar.gz",
                    },
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
            }
        else:
            raise Exception("A step of an invalid type exists in the collection.")
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )
    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn
def test_steps_with_map_params_pipeline(
    sagemaker_session,
    role,
    script_dir,
    pipeline_name,
    region_name,
    athena_dataset_definition,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    framework_version = "0.20.0"
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    output_prefix = ParameterString(name="OutputPrefix", default_value="output")
    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="my-process",
        display_name="ProcessingStep",
        description="description for Processing step",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
            ProcessingOutput(
                output_name="test_data",
                source="/opt/ml/processing/test",
                destination=Join(
                    on="/",
                    values=[
                        "s3:/",
                        sagemaker_session.default_bucket(),
                        "test-sklearn",
                        output_prefix,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                    ],
                ),
            ),
        ],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    sklearn_train = SKLearn(
        framework_version=framework_version,
        entry_point=os.path.join(script_dir, "train.py"),
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        role=role,
        hyperparameters={
            "batch-size": 500,
            "epochs": 5,
        },
    )
    step_train = TrainingStep(
        name="my-train",
        display_name="TrainingStep",
        description="description for Training step",
        estimator=sklearn_train,
        inputs=TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "train_data"
            ].S3Output.S3Uri
        ),
    )

    model = Model(
        image_uri=sklearn_train.image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="my-model",
        display_name="ModelStep",
        description="description for Model step",
        model=model,
        inputs=model_inputs,
    )

    # Condition step for evaluating model quality and branching execution
    cond_lte = ConditionGreaterThanOrEqualTo(
        left=step_train.properties.HyperParameters["batch-size"],
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_lte],
        if_steps=[],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_type, instance_count, output_prefix],
        steps=[step_process, step_train, step_cond],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    assert definition["Version"] == "2020-12-01"

    steps = definition["Steps"]
    assert len(steps) == 3

    training_args = {}
    condition_args = {}
    for step in steps:
        if step["Type"] == "Training":
            training_args = step["Arguments"]
        if step["Type"] == "Condition":
            condition_args = step["Arguments"]

    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == {
        "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
    }
    assert condition_args["Conditions"][0]["LeftValue"] == {
        "Get": "Steps.my-train.HyperParameters['batch-size']"
    }

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_tuning_single_algo(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    min_batch_size = ParameterInteger(name="MinBatchSize", default_value=64)
    max_batch_size = ParameterInteger(name="MaxBatchSize", default_value=128)
    hyperparameter_ranges = {
        "batch-size": IntegerParameter(min_batch_size, max_batch_size),
    }

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    step_tune = TuningStep(
        name="my-tuning-step",
        tuner=tuner,
        inputs=inputs,
    )

    best_model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_tune.get_top_model_s3_uri(
            top_k=0,
            s3_bucket=sagemaker_session.default_bucket(),
        ),
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_best_model = CreateModelStep(
        name="1st-model",
        model=best_model,
        inputs=model_inputs,
    )

    second_best_model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_tune.get_top_model_s3_uri(
            top_k=1,
            s3_bucket=sagemaker_session.default_bucket(),
        ),
        sagemaker_session=sagemaker_session,
        role=role,
        entry_point=entry_point,
        source_dir=base_dir,
    )
    step_second_best_model = CreateModelStep(
        name="2nd-best-model",
        model=second_best_model,
        inputs=model_inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
        steps=[step_tune, step_best_model, step_second_best_model],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={})
            assert re.match(
                rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
                execution.arn,
            )
            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()
            assert len(execution_steps) == 3
            for step in execution_steps:
                assert step["StepStatus"] == "Succeeded"
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_estimator_transformer_with_model_repack_with_estimator(estimator):
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
        entry_point=f"{DATA_DIR}/dummy_script.py",
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 3

    for request_dict in request_dicts:
        if request_dict["Type"] == "Training":
            assert request_dict["Name"] == "EstimatorTransformerStepRepackModel"
            assert request_dict["DependsOn"] == ["TestStep"]
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            # pop out the dynamically generated fields
            arguments["HyperParameters"].pop("sagemaker_submit_directory")
            assert arguments == {
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
                    + "sagemaker-scikit-learn:0.23-1-cpu-py3",
                },
                "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m5.large",
                    "VolumeSizeInGB": 30,
                },
                "RoleArn": "DummyRole",
                "InputDataConfig": [
                    {
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": "s3://my-bucket/model.tar.gz",
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }
                ],
                "HyperParameters": {
                    "inference_script": '"dummy_script.py"',
                    "model_archive": '"s3://my-bucket/model.tar.gz"',
                    "dependencies": "null",
                    "source_dir": "null",
                    "sagemaker_program": '"_repack_model.py"',
                    "sagemaker_container_log_level": "20",
                    "sagemaker_region": '"us-west-2"',
                },
                "VpcConfig": {"Subnets": ["abc", "def"], "SecurityGroupIds": ["123", "456"]},
                "DebugHookConfig": {
                    "S3OutputPath": "s3://my-bucket/",
                    "CollectionConfigurations": [],
                },
            }
        elif request_dict["Type"] == "Model":
            assert request_dict["Name"] == "EstimatorTransformerStepCreateModelStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["PrimaryContainer"]["ModelDataUrl"], Properties)
            arguments["PrimaryContainer"].pop("ModelDataUrl")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "ExecutionRoleArn": "DummyRole",
                "PrimaryContainer": {
                    "Environment": {},
                    "Image": "fakeimage",
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
            }
        else:
            raise Exception("A step of an invalid type exists in the collection.")