def test_parameter_with_default_value_zero():
    param = ParameterInteger(name="MyInteger", default_value=0)
    assert param.to_request() == {"Name": "MyInteger", "Type": "Integer", "DefaultValue": 0}

def test_condition_equals_parameter():
    param1 = ParameterInteger(name="MyInt1")
    param2 = ParameterInteger(name="MyInt2")
    cond = ConditionEquals(left=param1, right=param2)
    assert cond.to_request() == {
        "Type": "Equals",
        "LeftValue": {"Get": "Parameters.MyInt1"},
        "RightValue": {"Get": "Parameters.MyInt2"},
    }

def test_create_and_update_with_parallelism_config(
    sagemaker_session, role, pipeline_name, region_name
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)

    callback_steps = [
        CallbackStep(
            name=f"callback-step{count}",
            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
            inputs={"arg1": "foo"},
            outputs=[outputParam],
        )
        for count in range(500)
    ]
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=callback_steps,
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role, parallelism_config={"MaxParallelExecutionSteps": 50})
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
        response = pipeline.describe()
        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 50

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role, parallelism_config={"MaxParallelExecutionSteps": 55})
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        response = pipeline.describe()
        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 55
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_processing_step(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }

def test_lambda_step(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    outputParam2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )
    lambda_step.add_depends_on(["SecondTestStep"])
    assert lambda_step.to_request() == {
        "Name": "MyLambdaStep",
        "Type": "Lambda",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": param},
    }

def test_join_expressions():
    assert Join(
        values=[
            "foo",
            ParameterFloat(name="MyFloat"),
            ParameterInteger(name="MyInt"),
            ParameterString(name="MyStr"),
            Properties(path="Steps.foo.OutputPath.S3Uri"),
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            Join(on=",", values=[1, "a", False, 1.1]),
        ]
    ).expr == {
        "Std:Join": {
            "On": "",
            "Values": [
                "foo",
                {"Get": "Parameters.MyFloat"},
                {"Get": "Parameters.MyInt"},
                {"Get": "Parameters.MyStr"},
                {"Get": "Steps.foo.OutputPath.S3Uri"},
                {"Get": "Execution.PipelineExecutionId"},
                {"Std:Join": {"On": ",", "Values": [1, "a", False, 1.1]}},
            ],
        },
    }

def test_callback_step_output_expr():
    param = ParameterInteger(name="MyInt")
    outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2", output_type=CallbackOutputTypeEnum.Boolean)
    cb_step = CallbackStep(
        name="MyCallbackStep",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )
    assert cb_step.properties.Outputs["output1"].expr == {
        "Get": "Steps.MyCallbackStep.OutputParameters['output1']"
    }
    assert cb_step.properties.Outputs["output2"].expr == {
        "Get": "Steps.MyCallbackStep.OutputParameters['output2']"
    }

def test_fail_step_with_join_fn_in_error_message():
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[],
    )
    step_fail = FailStep(
        name="FailStep",
        error_message=Join(
            on=": ",
            values=["Failed due to xxx == yyy returns", step_cond.properties.Outcome],
        ),
    )
    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step_cond, step_fail],
        parameters=[param],
    )

    _expected_dsl = [
        {
            "Name": "CondStep",
            "Type": "Condition",
            "Arguments": {
                "Conditions": [
                    {"Type": "Equals", "LeftValue": {"Get": "Parameters.MyInt"}, "RightValue": 1}
                ],
                "IfSteps": [],
                "ElseSteps": [],
            },
        },
        {
            "Name": "FailStep",
            "Type": "Fail",
            "Arguments": {
                "ErrorMessage": {
                    "Std:Join": {
                        "On": ": ",
                        "Values": [
                            "Failed due to xxx == yyy returns",
                            {"Get": "Steps.CondStep.Outcome"},
                        ],
                    }
                }
            },
        },
    ]

    assert json.loads(pipeline.definition())["Steps"] == _expected_dsl

def test_invalid_pipeline_depended_on_fail_step(sagemaker_session, role, pipeline_name):
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_fail = FailStep(
        name="FailStep",
        error_message="Failed pipeline execution",
    )
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[],
        depends_on=["FailStep"],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[step_cond, step_fail],
        sagemaker_session=sagemaker_session,
        parameters=[param],
    )

    try:
        with pytest.raises(Exception) as error:
            pipeline.create(role)
        assert "CondStep can not depends on FailStep" in str(error.value)
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_two_step_fail_pipeline_with_str_err_msg(sagemaker_session, role, pipeline_name):
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_fail = FailStep(
        name="FailStep",
        error_message="Failed due to hitting in else branch",
    )
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[step_fail],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
        parameters=[param],
    )

    try:
        response = pipeline.create(role)
        pipeline_arn = response["PipelineArn"]
        execution = pipeline.start(parameters={})
        response = execution.describe()
        assert response["PipelineArn"] == pipeline_arn

        try:
            execution.wait(delay=30, max_attempts=60)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 2
        for execution_step in execution_steps:
            if execution_step["StepName"] == "CondStep":
                assert execution_step["StepStatus"] == "Succeeded"
                continue
            assert execution_step["StepName"] == "FailStep"
            assert execution_step["StepStatus"] == "Failed"
            assert execution_step["FailureReason"] == "Failed due to hitting in else branch"
        metadata = execution_steps[0]["Metadata"]["Fail"]
        assert metadata["ErrorMessage"] == "Failed due to hitting in else branch"

        # Check FailureReason field in ListPipelineExecutions
        executions = sagemaker_session.sagemaker_client.list_pipeline_executions(
            PipelineName=pipeline.name
        )["PipelineExecutionSummaries"]
        assert len(executions) == 1
        assert executions[0]["PipelineExecutionStatus"] == "Failed"
        assert (
            "Step failure: One or multiple steps failed"
            in executions[0]["PipelineExecutionFailureReason"]
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_parameter_integer_implicit_value():
    param = ParameterInteger("MyInteger", 1)
    with pytest.raises(TypeError) as error:
        int(param)
    assert str(error.value) == "Pipeline variables do not support __int__ operation."

def test_one_step_lambda_pipeline(sagemaker_session, role, pipeline_name, region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    step_lambda = LambdaStep(
        name="lambda-step",
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo"},
        outputs=[outputParam1],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_lambda],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_add_func_of_join():
    func_join1 = Join(values=[1, "a"])
    param = ParameterInteger(name="MyInteger", default_value=3)
    with pytest.raises(TypeError) as error:
        func_join1 + param
    assert str(error.value) == "Pipeline variables do not support concatenation."

def test_condition_equals():
    param = ParameterInteger(name="MyInt")
    cond = ConditionEquals(left=param, right=1)
    assert cond.to_request() == {
        "Type": "Equals",
        "LeftValue": {"Get": "Parameters.MyInt"},
        "RightValue": 1,
    }

def test_large_pipeline(sagemaker_session, role, pipeline_name, region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)

    callback_steps = [
        CallbackStep(
            name=f"callback-step{count}",
            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
            inputs={"arg1": "foo"},
            outputs=[outputParam],
        )
        for count in range(2000)
    ]
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=callback_steps,
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
        response = pipeline.describe()
        assert len(json.loads(pipeline.describe()["PipelineDefinition"])["Steps"]) == 2000

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_add_depends_on(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")

    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3.add_depends_on([step_2.name])

    assert "DependsOn" not in step_1.to_request()
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == ["MyProcessingStep-1", "MyProcessingStep-2"]

def test_two_steps_emr_pipeline(sagemaker_session, role, pipeline_name, region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    emr_step_config = EMRStepConfig(
        jar="s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar",
        args=["dummy_emr_script_path"],
    )

    step_emr_1 = EMRStep(
        name="emr-step-1",
        cluster_id="j-1YONHTCP3YZKC",
        display_name="emr_step_1",
        description="MyEMRStepDescription",
        step_config=emr_step_config,
    )

    step_emr_2 = EMRStep(
        name="emr-step-2",
        cluster_id=step_emr_1.properties.ClusterId,
        display_name="emr_step_2",
        description="MyEMRStepDescription",
        step_config=emr_step_config,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_emr_1, step_emr_2],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_condition_step():
    param = ParameterInteger(name="MyInt")
    cond = ConditionEquals(left=param, right=1)
    step1 = CustomStep("MyStep1")
    step2 = CustomStep("MyStep2")
    cond_step = ConditionStep(
        name="MyConditionStep",
        depends_on=["TestStep"],
        conditions=[cond],
        if_steps=[step1],
        else_steps=[step2],
    )
    cond_step.add_depends_on(["SecondTestStep"])
    assert cond_step.to_request() == {
        "Name": "MyConditionStep",
        "Type": "Condition",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            "Conditions": [
                {
                    "Type": "Equals",
                    "LeftValue": {"Get": "Parameters.MyInt"},
                    "RightValue": 1,
                },
            ],
            "IfSteps": [
                {
                    "Name": "MyStep1",
                    "Type": "Training",
                    "Arguments": {},
                },
            ],
            "ElseSteps": [
                {
                    "Name": "MyStep2",
                    "Type": "Training",
                    "Arguments": {},
                },
            ],
        },
    }
    assert cond_step.properties.Outcome.expr == {"Get": "Steps.MyConditionStep.Outcome"}

def test_callback_step():
    param = ParameterInteger(name="MyInt")
    outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2", output_type=CallbackOutputTypeEnum.Boolean)
    cb_step = CallbackStep(
        name="MyCallbackStep",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )
    cb_step.add_depends_on(["SecondTestStep"])
    assert cb_step.to_request() == {
        "Name": "MyCallbackStep",
        "Type": "Callback",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": param},
    }

def test_add_func():
    param_str = ParameterString(name="MyString", default_value="s3://foo/bar/baz.csv")
    param_int = ParameterInteger(name="MyInteger", default_value=3)
    param_float = ParameterFloat(name="MyFloat", default_value=1.5)
    param_bool = ParameterBoolean(name="MyBool")

    with pytest.raises(TypeError) as error:
        param_str + param_int
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_int + param_float
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_float + param_bool
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_bool + param_str
    assert str(error.value) == "Pipeline variables do not support concatenation."

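# Illustrative sketch (not part of the test suite): since `+` concatenation is rejected for
# pipeline variables (as test_add_func above asserts), the supported pattern is to compose
# strings lazily with Join and let the service resolve them at execution time. The parameter
# name and S3 path below are arbitrary examples.
def example_join_instead_of_concatenation():
    from sagemaker.workflow.functions import Join
    from sagemaker.workflow.parameters import ParameterString

    bucket_param = ParameterString(name="MyBucket", default_value="my-bucket")
    # Resolves at execution time to "s3://<MyBucket>/prefix/data.csv"
    s3_uri = Join(on="", values=["s3://", bucket_param, "/prefix/data.csv"])
    return s3_uri.expr  # a {"Std:Join": ...} expression, as in test_join_expressions
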
def test_lambda_step_output_expr(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    outputParam2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )
    assert lambda_step.properties.Outputs["output1"].expr == {
        "Get": "Steps.MyLambdaStep.OutputParameters['output1']"
    }
    assert lambda_step.properties.Outputs["output2"].expr == {
        "Get": "Steps.MyLambdaStep.OutputParameters['output2']"
    }

def test_lambda_step(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    output_param1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    output_param2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        display_name="MyLambdaStep",
        description="MyLambdaStepDescription",
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[output_param1, output_param2],
        cache_config=cache_config,
    )
    lambda_step.add_depends_on(["SecondTestStep"])
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[param],
        steps=[lambda_step],
        sagemaker_session=sagemaker_session,
    )
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyLambdaStep",
        "Type": "Lambda",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "DisplayName": "MyLambdaStep",
        "Description": "MyLambdaStepDescription",
        "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": {"Get": "Parameters.MyInt"}},
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

def test_parameter_integer_implicit_value():
    param = ParameterInteger("MyInteger")
    assert param.__int__() == 0
    param1 = ParameterInteger("MyInteger", 1)
    assert param1.__int__() == 1
    param2 = ParameterInteger("MyInteger", default_value=2)
    assert param2.__int__() == 2
    param3 = ParameterInteger(name="MyInteger", default_value=3)
    assert param3.__int__() == 3

def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")

    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]

    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]

    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )

    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name
    )
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")

    assert definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"]["LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_two_processing_job_depends_on(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
    cpu_instance_type,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark_1 = ProcessingStep(
        name="pyspark-process-1",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    step_pyspark_2 = ProcessingStep(
        name="pyspark-process-2",
        depends_on=[step_pyspark_1],
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark_1, step_pyspark_2],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 2
        time_stamp = {}
        for execution_step in execution_steps:
            name = execution_step["StepName"]
            if name == "pyspark-process-1":
                time_stamp[name] = execution_step["EndTime"]
            else:
                time_stamp[name] = execution_step["StartTime"]
        assert time_stamp["pyspark-process-1"] < time_stamp["pyspark-process-2"]
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
    input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
    S3Uploader.upload_string_as_file_body(
        body=body,
        desired_s3_uri=input_data_uri,
        sagemaker_session=sagemaker_session,
    )
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java", "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when the role used in the pipeline execution is
        # different from that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_one_step_pyspark_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark = ProcessingStep(
        name="pyspark-process",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when the role used in the pipeline execution is
        # different from that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "pyspark-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def test_one_step_framework_processing_pipeline(
    sagemaker_session,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    region_name,
    athena_dataset_definition,
):
    """Use `SKLearnProcessor` to test `FrameworkProcessor`."""
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    inputs = [
        ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs)

    step_sklearn = ProcessingStep(
        name="sklearn-process",
        processor=sklearn_processor,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        job_arguments=run_args.arguments,
        code=run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_sklearn],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when the role used in the pipeline execution is
        # different from that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="CustomerChurnPackageGroup",  # Choose any name
    pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
    base_job_prefix="CustomerChurn",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with CustomerChurn data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value="s3://sm-pipelines-demo-data-123456789/churn.txt",  # Change this to point to the s3 location of your raw input data.
    )

    # Processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-CustomerChurn-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="CustomerChurnProcess",  # choose any name
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the SageMaker built-in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/CustomerChurn-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="binary:logistic",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="CustomerChurnTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="CustomerChurnEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # Register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="CustomerChurnRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_lte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=0.8,  # You can change the threshold here
    )
    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline

def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline

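# Illustrative usage sketch (not part of the original seedcode): how a caller might build and
# run one of the pipelines returned by the get_pipeline functions above. The region and role
# ARN are placeholders; definition/upsert/start/wait are standard SageMaker Pipelines SDK calls
# assumed to be available on the returned Pipeline object.
def example_run_pipeline(
    region="us-west-2",
    role_arn="arn:aws:iam::111122223333:role/ExampleSageMakerRole",
):
    pipeline = get_pipeline(region=region, role=role_arn)
    print(pipeline.definition())  # inspect the generated pipeline DSL as JSON
    pipeline.upsert(role_arn=role_arn)  # create the pipeline, or update it if it already exists
    execution = pipeline.start()
    execution.wait()
    return execution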