def test_pipeline_start(sagemaker_session_mock):
    """Pipeline.start should call StartPipelineExecution with the expected
    request fields for each calling convention (bare, display name, params).

    NOTE(review): the original asserted with ``Mock.called_with(...)``, which
    is not a real Mock method — accessing it auto-creates a truthy child mock,
    so the asserts always passed and verified nothing. It also targeted
    ``sagemaker_session_mock.start_pipeline_execution`` while the return value
    was stubbed on ``sagemaker_session_mock.sagemaker_client`` — the client is
    what Pipeline.start invokes, so assert on it.
    """
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.return_value = {
        "PipelineExecutionArn": "my:arn"
    }
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[ParameterString("alpha", "beta"), ParameterString("gamma", "delta")],
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )

    # Bare start: only the pipeline name goes out.
    pipeline.start()
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline",
    )

    # Display name is forwarded verbatim.
    pipeline.start(execution_display_name="pipeline-execution")
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline",
        PipelineExecutionDisplayName="pipeline-execution",
    )

    # Parameter overrides are formatted as Name/Value pairs.
    pipeline.start(parameters=dict(alpha="epsilon"))
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline",
        PipelineParameters=[{"Name": "alpha", "Value": "epsilon"}],
    )
def test_pipeline_execution_basics(sagemaker_session_mock):
    """Exercise stop/describe/list_steps on a started execution.

    NOTE(review): the original used ``Mock.called_with(...)`` (a no-op,
    always-truthy attribute access, not a real assertion) and its last check
    named ``describe_pipeline_execution_steps`` although the API stubbed and
    invoked by ``list_steps()`` is ``list_pipeline_execution_steps``.
    """
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.return_value = {
        "PipelineExecutionArn": "my:arn"
    }
    # One fake step so list_steps() has something to return.
    sagemaker_session_mock.sagemaker_client.list_pipeline_execution_steps.return_value = {
        "PipelineExecutionSteps": [Mock()]
    }
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[ParameterString("alpha", "beta"), ParameterString("gamma", "delta")],
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )
    execution = pipeline.start()

    execution.stop()
    sagemaker_session_mock.sagemaker_client.stop_pipeline_execution.assert_called_with(
        PipelineExecutionArn="my:arn"
    )

    execution.describe()
    sagemaker_session_mock.sagemaker_client.describe_pipeline_execution.assert_called_with(
        PipelineExecutionArn="my:arn"
    )

    steps = execution.list_steps()
    sagemaker_session_mock.sagemaker_client.list_pipeline_execution_steps.assert_called_with(
        PipelineExecutionArn="my:arn"
    )
    assert len(steps) == 1
def test_processing_step(sagemaker_session):
    """ProcessingStep.to_request should carry pipeline parameters through as
    the parameter objects themselves (not rendered strings) and include the
    cache configuration; step properties resolve to Get expressions.
    """
    # Parameters stand in for concrete values; the request dict must contain
    # the objects unrendered so the backend resolves them at execution time.
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    # NOTE(review): "fakeimage" / "DummyRole" presumably mirror the IMAGE_URI
    # and ROLE module constants — confirm against the fixtures.
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        # The parameter object itself, not a rendered string.
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    # Step properties reference the step by name via a Get expression.
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
def test_parameter_to_string_and_string_implicit_value():
    """to_string() hands back the parameter itself, while calling str() on a
    pipeline variable is explicitly forbidden."""
    my_param = ParameterString("MyString", "1")
    assert my_param.to_string() == my_param
    expected_message = "Pipeline variables do not support __str__ operation."
    with pytest.raises(TypeError) as exc_info:
        str(my_param)
    assert str(exc_info.value) == expected_message
def test_pipeline_start_before_creation(sagemaker_session_mock):
    """Starting a pipeline that does not exist service-side raises ValueError."""
    # Simulate "pipeline not found": describe_pipeline fails with a ClientError.
    sagemaker_session_mock.sagemaker_client.describe_pipeline.side_effect = ClientError({}, "bar")
    params = [ParameterString("alpha", "beta"), ParameterString("gamma", "delta")]
    uncreated_pipeline = Pipeline(
        name="MyPipeline",
        parameters=params,
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )
    with pytest.raises(ValueError):
        uncreated_pipeline.start()
def test_add_depends_on(sagemaker_session):
    """Dependencies can be given at construction time or appended later with
    add_depends_on; steps without dependencies omit the DependsOn key.
    """
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest")
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    # step_1: no dependencies; step_2: depends on step_1 via the constructor;
    # step_3: starts with step_1 and gains step_2 afterwards.
    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    # Dependencies may also be added after construction, by step name.
    step_3.add_depends_on([step_2.name])
    assert "DependsOn" not in step_1.to_request()
    # Step objects passed to depends_on render as their names in the request.
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == [
        "MyProcessingStep-1", "MyProcessingStep-2"
    ]
def get_pipeline(region, role, image_uri, model_path):
    """Build the iris model train-and-register pipeline.

    Args:
        region: AWS region the pipeline runs in.
        role: IAM execution role ARN; resolved from the session when None.
        image_uri: training image URI for the Estimator.
        model_path: S3 output path for model artifacts.

    Returns:
        A SageMaker workflow Pipeline instance.
    """
    session = get_session(region)
    if role is None:
        role = sagemaker.session.get_execution_role(session)

    # Execution-time parameters for the two training data channels.
    train_data_param = ParameterString(name='train-data')
    validation_data_param = ParameterString(name='validation-data')
    # NOTE(review): the original also created 'image-uri' and 'model-path'
    # ParameterStrings that were never wired into the estimator or the
    # pipeline; those unused locals have been removed.

    estimator = Estimator(
        image_uri=image_uri,
        instance_type='ml.m5.xlarge',
        instance_count=1,
        output_path=model_path,
        sagemaker_session=session,
        role=role,
    )

    ### Your Pipeline definition goes here ....
    ###########################################
    step_train = TrainingStep(
        name="iris-model-train",
        estimator=estimator,
        inputs={
            "train": TrainingInput(s3_data=train_data_param, content_type='text/csv'),
            "validation": TrainingInput(s3_data=validation_data_param, content_type='text/csv'),
        },
    )
    step_register = RegisterModel(
        name='iris-model-register',
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        estimator=estimator,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name='iris-model',
    )
    pipeline = Pipeline(
        name='iris-model-pipeline',
        steps=[step_train, step_register],
        parameters=[train_data_param, validation_data_param],
        sagemaker_session=session,
    )
    ### end of Pipeline definition
    ###########################################
    return pipeline
def test_condition_or():
    """ConditionOr composes its sub-conditions under Type "Or"."""
    start_time = ExecutionVariables.START_DATETIME
    greater = ConditionGreaterThan(left=start_time, right="2020-12-01")
    str_param = ParameterString(name="MyStr")
    membership = ConditionIn(value=str_param, in_values=["abc", "def"])

    request = ConditionOr(conditions=[greater, membership]).to_request()

    expected_sub_conditions = [
        {
            "Type": "GreaterThan",
            "LeftValue": {"Get": "Execution.StartDateTime"},
            "RightValue": "2020-12-01",
        },
        {
            "Type": "In",
            "QueryValue": {"Get": "Parameters.MyStr"},
            "Values": ["abc", "def"],
        },
    ]
    assert request == {"Type": "Or", "Conditions": expected_sub_conditions}
def test_join_expressions():
    """Join renders as Std:Join with every value reduced to its expression,
    including a nested Join rendered recursively."""
    nested_join = Join(on=",", values=[1, "a", False, 1.1])
    joined = Join(
        values=[
            "foo",
            ParameterFloat(name="MyFloat"),
            ParameterInteger(name="MyInt"),
            ParameterString(name="MyStr"),
            Properties(path="Steps.foo.OutputPath.S3Uri"),
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            nested_join,
        ]
    )
    expected_values = [
        "foo",
        {"Get": "Parameters.MyFloat"},
        {"Get": "Parameters.MyInt"},
        {"Get": "Parameters.MyStr"},
        {"Get": "Steps.foo.OutputPath.S3Uri"},
        {"Get": "Execution.PipelineExecutionId"},
        {"Std:Join": {"On": ",", "Values": [1, "a", False, 1.1]}},
    ]
    assert joined.expr == {"Std:Join": {"On": "", "Values": expected_values}}
def test_pipeline_interpolates_lambda_outputs(sagemaker_session):
    """A LambdaOutput consumed by a downstream step is interpolated in the
    rendered definition into a Get expression over the producing step's
    OutputParameters.
    """
    parameter = ParameterString("MyStr")
    output_param1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    output_param2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.String)
    lambda_step1 = LambdaStep(
        name="MyLambdaStep1",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo"},
        outputs=[output_param1],
    )
    lambda_step2 = LambdaStep(
        name="MyLambdaStep2",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        # Consumes step1's output; should render as a Get expression below.
        inputs={"arg1": output_param1},
        outputs=[output_param2],
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[lambda_step1, lambda_step2],
        sagemaker_session=sagemaker_session,
    )
    assert json.loads(pipeline.definition()) == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [
            {
                "Name": "MyLambdaStep1",
                "Type": "Lambda",
                "Arguments": {"arg1": "foo"},
                "DependsOn": ["TestStep"],
                "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
                "OutputParameters": [{"OutputName": "output1", "OutputType": "String"}],
            },
            {
                "Name": "MyLambdaStep2",
                "Type": "Lambda",
                # The interpolated reference to step1's "output1".
                "Arguments": {"arg1": {"Get": "Steps.MyLambdaStep1.OutputParameters['output1']"}},
                "DependsOn": ["TestStep"],
                "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
                "OutputParameters": [{"OutputName": "output2", "OutputType": "String"}],
            },
        ],
    }
def test_condition_in():
    """ConditionIn renders the parameter as QueryValue plus literal Values."""
    str_param = ParameterString(name="MyStr")
    request = ConditionIn(value=str_param, in_values=["abc", "def"]).to_request()
    assert request == {
        "Type": "In",
        "QueryValue": {"Get": "Parameters.MyStr"},
        "Values": ["abc", "def"],
    }
def get_pipeline(
    region,
    default_bucket='sagemaker-us-east-1-376337229415',
    model_package_group_name="AbaloneExample",
    pipeline_name="AbaloneExample",
    base_job_prefix="abalone",
):
    """Gets a SageMaker ML Pipeline instance working with the Abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        default_bucket: the bucket to use for storing the artifacts.
        model_package_group_name: model package group used at registration.
        pipeline_name: name of the pipeline resource.
        base_job_prefix: prefix applied to generated job names/paths.

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    role = sagemaker.session.get_execution_role(sagemaker_session)

    # Pipeline execution parameters.
    # NOTE(review): the original referenced processing_instance_*,
    # training_instance_* and model_approval_status without defining them
    # (a NameError unless they were module-level globals); they are defined
    # here explicitly with conventional defaults — confirm against the
    # original module's globals.
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge")
    training_instance_count = ParameterInteger(
        name="TrainingInstanceCount", default_value=1)
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value='s3://{}/abalone/abalone-dataset.csv'.format(default_bucket),
    )

    # Feature-engineering processing step (delegated to the module helper).
    step_process = processing_job(processing_instance_type, processing_instance_count,
                                  sagemaker_session, role)

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/Abalonetrain"
    image_uri = get_image_uri(framework="xgboost",
                              region=region,
                              version="1.0-1",
                              py_version="py3",
                              training_instance_type=training_instance_type)
    step_train = training_job(image_uri, training_instance_type, training_instance_count,
                              model_path, sagemaker_session, role)

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
def test_pipeline_basic():
    """A one-step pipeline serializes parameters/steps as raw objects in
    to_request() and as rendered Get expressions in definition()."""
    str_param = ParameterString("MyStr")
    custom_step = CustomStep(name="MyStep", input_data=str_param)
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[str_param],
        steps=[custom_step],
        sagemaker_session=sagemaker_session_mock,
    )

    expected_request = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": ExecutionVariables.PIPELINE_NAME,
            "TrialName": ExecutionVariables.PIPELINE_EXECUTION_ID,
        },
        "Steps": [
            {
                "Name": "MyStep",
                "Type": "Training",
                "Arguments": {"input_data": str_param},
            }
        ],
    }
    assert pipeline.to_request() == expected_request

    expected_definition = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [
            {
                "Name": "MyStep",
                "Type": "Training",
                "Arguments": {"input_data": {"Get": "Parameters.MyStr"}},
            }
        ],
    }
    assert ordered(json.loads(pipeline.definition())) == ordered(expected_definition)
def test_pipeline_interpolates_callback_outputs():
    """A CallbackOutput consumed by a downstream step is interpolated in the
    rendered definition into a Get expression over the producing step's
    OutputParameters.
    """
    parameter = ParameterString("MyStr")
    outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2", output_type=CallbackOutputTypeEnum.String)
    cb_step1 = CallbackStep(
        name="MyCallbackStep1",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={"arg1": "foo"},
        outputs=[outputParam1],
    )
    cb_step2 = CallbackStep(
        name="MyCallbackStep2",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        # Consumes step1's output; should render as a Get expression below.
        inputs={"arg1": outputParam1},
        outputs=[outputParam2],
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[cb_step1, cb_step2],
        sagemaker_session=sagemaker_session_mock,
    )
    assert json.loads(pipeline.definition()) == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [
            {
                "Name": "MyCallbackStep1",
                "Type": "Callback",
                "Arguments": {"arg1": "foo"},
                "DependsOn": ["TestStep"],
                "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
                "OutputParameters": [{"OutputName": "output1", "OutputType": "String"}],
            },
            {
                "Name": "MyCallbackStep2",
                "Type": "Callback",
                # The interpolated reference to step1's "output1".
                "Arguments": {"arg1": {"Get": "Steps.MyCallbackStep1.OutputParameters['output1']"}},
                "DependsOn": ["TestStep"],
                "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
                "OutputParameters": [{"OutputName": "output2", "OutputType": "String"}],
            },
        ],
    }
def test_condition_less_than_or_equal_to():
    """Both sides of the comparison render as Get expressions."""
    start_time_var = ExecutionVariables.START_DATETIME
    threshold_param = ParameterString(name="StartDateTime")
    request = ConditionLessThanOrEqualTo(left=start_time_var, right=threshold_param).to_request()
    assert request == {
        "Type": "LessThanOrEqualTo",
        "LeftValue": {"Get": "Execution.StartDateTime"},
        "RightValue": {"Get": "Parameters.StartDateTime"},
    }
def test_condition_not_in():
    """Negating a ConditionIn nests its request under the Expression key."""
    str_param = ParameterString(name="MyStr")
    membership = ConditionIn(value=str_param, in_values=["abc", "def"])
    negated = ConditionNot(expression=membership)
    inner_request = {
        "Type": "In",
        "QueryValue": {"Get": "Parameters.MyStr"},
        "Values": ["abc", "def"],
    }
    assert negated.to_request() == {"Type": "Not", "Expression": inner_request}
def test_condition_not():
    """Negating an equality check nests its request under the Expression key."""
    str_param = ParameterString(name="MyStr")
    equality = ConditionEquals(left=str_param, right="foo")
    negated = ConditionNot(expression=equality)
    inner_request = {
        "Type": "Equals",
        "LeftValue": {"Get": "Parameters.MyStr"},
        "RightValue": "foo",
    }
    assert negated.to_request() == {"Type": "Not", "Expression": inner_request}
def test_transform_step_with_transformer(pipeline_session):
    """Transformer.transform under a PipelineSession warns (instead of
    launching a job) and returns captured step arguments; those arguments
    feed a TransformStep without further warnings, and the parameter shows
    up in the rendered definition as its Get expression.
    """
    model_name = ParameterString("ModelName")
    transformer = Transformer(
        model_name=model_name,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{pipeline_session.default_bucket()}/Transform",
        sagemaker_session=pipeline_session,
    )
    transform_inputs = TransformInput(
        data=f"s3://{pipeline_session.default_bucket()}/batch-data",
    )
    # Under a PipelineSession, .transform() captures the request rather than
    # starting a job, and emits exactly one UserWarning saying so.
    with warnings.catch_warnings(record=True) as w:
        step_args = transformer.transform(
            data=transform_inputs.data,
            data_type=transform_inputs.data_type,
            content_type=transform_inputs.content_type,
            compression_type=transform_inputs.compression_type,
            split_type=transform_inputs.split_type,
            input_filter=transform_inputs.input_filter,
            output_filter=transform_inputs.output_filter,
            join_source=transform_inputs.join_source,
            model_client_config=transform_inputs.model_client_config,
        )
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)
    # Feeding the captured step_args to TransformStep must not warn.
    with warnings.catch_warnings(record=True) as w:
        step = TransformStep(
            name="MyTransformStep",
            step_args=step_args,
        )
        assert len(w) == 0
    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        parameters=[model_name],
        sagemaker_session=pipeline_session,
    )
    # The definition renders the parameter as its Get expression; patch the
    # captured args the same way so the dicts compare equal.
    step_args["ModelName"] = model_name.expr
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Arguments": step_args,
    }
def test_condition_in_mixed():
    """in_values may mix literals, properties and execution variables, each
    rendered to its own expression form."""
    str_param = ParameterString(name="MyStr")
    foo_prop = Properties("foo")
    start_time = ExecutionVariables.START_DATETIME
    request = ConditionIn(value=str_param, in_values=["abc", foo_prop, start_time]).to_request()
    assert request == {
        "Type": "In",
        "QueryValue": {"Get": "Parameters.MyStr"},
        "Values": [
            "abc",
            {"Get": "foo"},
            {"Get": "Execution.StartDateTime"},
        ],
    }
def test_json_get_expressions_with_invalid_step_name():
    """JsonGet.expr rejects step names that are empty or not plain strings."""
    invalid_step_names = ("", ParameterString(name="MyString"))
    for bad_name in invalid_step_names:
        with pytest.raises(ValueError) as err:
            JsonGet(
                step_name=bad_name,
                property_file="my-property-file",
                json_path="my-json-path",
            ).expr
        assert "Please give a valid step name as a string" in str(err.value)
def test_pipeline_basic():
    """A one-step pipeline serializes parameters/steps as raw objects in
    to_request() and as rendered Get expressions in definition().

    NOTE(review): a function with this exact name is also defined earlier in
    this file (with an additional PipelineExperimentConfig expectation); this
    later definition shadows it, so only this one is collected — confirm
    which expectation matches the SDK version under test before renaming.
    """
    str_param = ParameterString("MyStr")
    custom_step = CustomStep(name="MyStep", input_data=str_param)
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[str_param],
        steps=[custom_step],
        sagemaker_session=sagemaker_session_mock,
    )

    expected_request = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "Steps": [
            {
                "Name": "MyStep",
                "Type": "Training",
                "Arguments": {"input_data": str_param},
            }
        ],
    }
    assert pipeline.to_request() == expected_request

    expected_definition = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "Steps": [
            {
                "Name": "MyStep",
                "Type": "Training",
                "Arguments": {"input_data": {"Get": "Parameters.MyStr"}},
            }
        ],
    }
    assert ordered(json.loads(pipeline.definition())) == ordered(expected_definition)
def test_data_quality_check_step(
    sagemaker_session,
    check_job_config,
    model_package_group_name,
    supplied_baseline_statistics_uri,
    supplied_baseline_constraints_uri,
):
    """QualityCheckStep configured for data quality should produce the
    expected step-definition DSL fixture.
    """
    data_quality_check_config = DataQualityCheckConfig(
        baseline_dataset=ParameterString(name="BaselineDataset"),
        dataset_format=DatasetFormat.csv(header=True),
        output_s3_uri="s3://...",
        record_preprocessor_script=
        "s3://my_bucket/data_quality/preprocessor.py",
        post_analytics_processor_script=
        "s3://my_bucket/data_quality/postprocessor.py",
    )
    # skip_check/register_new_baseline both False: run the check against the
    # supplied baselines rather than registering a new one.
    data_quality_check_step = QualityCheckStep(
        name="DataQualityCheckStep",
        skip_check=False,
        register_new_baseline=False,
        quality_check_config=data_quality_check_config,
        check_job_config=check_job_config,
        model_package_group_name=model_package_group_name,
        supplied_baseline_statistics=supplied_baseline_statistics_uri,
        supplied_baseline_constraints=supplied_baseline_constraints_uri,
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[
            supplied_baseline_statistics_uri,
            supplied_baseline_constraints_uri,
            model_package_group_name,
        ],
        steps=[data_quality_check_step],
        sagemaker_session=sagemaker_session,
    )
    # NOTE(review): _get_step_definition_for_test presumably extracts the
    # step definition and scrubs the named run-specific fields before the
    # comparison — confirm against the helper's implementation.
    step_definition = _get_step_definition_for_test(
        pipeline, ["baseline_dataset_input", "quality_check_output"])

    assert step_definition == _expected_data_quality_dsl
def test_parameter_string_with_enum_values():
    """EnumValues are serialized in to_request, with and without a default,
    while expr and python_type are unaffected."""
    no_default = ParameterString("MyString", enum_values=["a", "b"])
    assert no_default.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "EnumValues": ["a", "b"],
    }
    assert no_default.expr == {"Get": "Parameters.MyString"}
    assert no_default.parameter_type.python_type == str

    with_default = ParameterString("MyString", default_value="a", enum_values=["a", "b"])
    assert with_default.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "DefaultValue": "a",
        "EnumValues": ["a", "b"],
    }
    assert with_default.expr == {"Get": "Parameters.MyString"}
    assert with_default.parameter_type.python_type == str
def test_add_func():
    """Using + between any pair of pipeline variables raises TypeError with
    the standard concatenation message."""
    str_var = ParameterString(name="MyString", default_value="s3://foo/bar/baz.csv")
    int_var = ParameterInteger(name="MyInteger", default_value=3)
    float_var = ParameterFloat(name="MyFloat", default_value=1.5)
    bool_var = ParameterBoolean(name="MyBool")

    # Cycle through all four types on both sides of the operator.
    operand_pairs = [
        (str_var, int_var),
        (int_var, float_var),
        (float_var, bool_var),
        (bool_var, str_var),
    ]
    for left, right in operand_pairs:
        with pytest.raises(TypeError) as error:
            left + right
        assert str(error.value) == "Pipeline variables do not support concatenation."
def test_parameter_string_with_enum_values_to_request():
    """to_request serializes EnumValues, adding DefaultValue when present.

    NOTE(review): this function previously reused the name
    test_parameter_string_with_enum_values, shadowing the earlier test of
    the same name so only this one was collected; renamed so both run (the
    earlier test's assertions are a superset consistent with these).
    """
    param = ParameterString("MyString", enum_values=["a", "b"])
    assert param.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "EnumValues": ["a", "b"],
    }
    param = ParameterString("MyString", default_value="a", enum_values=["a", "b"])
    assert param.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "DefaultValue": "a",
        "EnumValues": ["a", "b"],
    }
def test_large_pipeline_update(sagemaker_session_mock, role_arn):
    """A very large pipeline definition is uploaded to S3 and the service is
    handed a PipelineDefinitionS3Location instead of an inline definition.

    NOTE(review): the original asserted with ``Mock.called_with(...)`` (not a
    real Mock method — always truthy, verified nothing), called
    ``pipeline.create()`` while asserting on ``update_pipeline``, and rebound
    ``s3.S3Uploader.upload_string_as_file_body`` to a Mock without restoring
    it, leaking the patch into later tests. Confirm the exact request kwargs
    against the SDK version under test.
    """
    parameter = ParameterString("MyStr")
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        # 2000 copies of the step push the definition over the inline limit.
        steps=[CustomStep(name="MyStep", input_data=parameter)] * 2000,
        sagemaker_session=sagemaker_session_mock,
    )
    # Local import keeps the fix self-contained if the module only imports Mock.
    from unittest.mock import patch

    # Patch inside a context manager so the real uploader is restored even on
    # assertion failure.
    with patch.object(s3.S3Uploader, "upload_string_as_file_body") as upload_mock:
        pipeline.update(role_arn=role_arn)
        upload_mock.assert_called_with(
            body=pipeline.definition(),
            s3_uri="s3://s3_bucket/MyPipeline",
        )
    sagemaker_session_mock.sagemaker_client.update_pipeline.assert_called_with(
        PipelineName="MyPipeline",
        PipelineDefinitionS3Location={"Bucket": "s3_bucket", "ObjectKey": "MyPipeline"},
        RoleArn=role_arn,
    )
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with abalone data.

    Preprocess -> train -> evaluate -> conditionally register, gated on the
    evaluated MSE.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline; resolved from
            the session when None.
        default_bucket: the bucket to use for storing the artifacts
        model_package_group_name: package group used when registering.
        pipeline_name: name of the pipeline resource.
        base_job_prefix: prefix applied to generated job names and S3 paths.

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    # Training channels are wired to the processing step's outputs via
    # step properties, resolved at execution time.
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    # Property file exposing the evaluation JSON to the condition step below.
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
            content_type="application/json"))
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution:
    # register only when MSE from the evaluation report is <= 6.0.
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
def test_parameter_string_implicit_value():
    """__str__ yields the default value, or the empty string when no default
    is set, regardless of how the default was supplied."""
    assert ParameterString("MyString").__str__() == ""
    assert ParameterString("MyString", "1").__str__() == "1"
    assert ParameterString("MyString", default_value="2").__str__() == "2"
    assert ParameterString(name="MyString", default_value="3").__str__() == "3"
    # enum_values do not affect the implicit string value.
    assert ParameterString(name="MyString", default_value="3", enum_values=["3"]).__str__() == "3"
def test_parsable_parameter_string():
    """A parameter with an S3-URI default can be fed straight to urlparse."""
    s3_param = ParameterString("MyString", default_value="s3://foo/bar/baz.csv")
    parsed = urlparse(s3_param)
    assert parsed.scheme == "s3"
def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role, pipeline_name):
    """Integration test: build a one-step Data Wrangler processing pipeline,
    validate its rendered definition, then create/start it against the
    service and tear it down.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")

    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    # The output config is passed as a single quoted CLI argument.
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]
    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]
    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )
    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    # The image must be the region's official data-wrangler image.
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name)
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    # Two inputs expected: the user CSV plus a "flow" input (presumably the
    # recipe file injected by the processor — confirm against the SDK).
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")
    assert definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"][
        "LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            # Allow up to ~10 minutes; a timeout is tolerated because only
            # the step listing below is asserted.
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        # Best-effort cleanup of the service-side pipeline resource.
        try:
            pipeline.delete()
        except Exception:
            pass