def test_parameter_with_default_value_zero():
    param = ParameterInteger(name="MyInteger", default_value=0)
    assert param.to_request() == {
        "Name": "MyInteger",
        "Type": "Integer",
        "DefaultValue": 0
    }
Example #2
0
def test_condition_equals_parameter():
    param1 = ParameterInteger(name="MyInt1")
    param2 = ParameterInteger(name="MyInt2")
    cond = ConditionEquals(left=param1, right=param2)
    assert cond.to_request() == {
        "Type": "Equals",
        "LeftValue": {
            "Get": "Parameters.MyInt1"
        },
        "RightValue": {
            "Get": "Parameters.MyInt2"
        },
    }
def test_create_and_update_with_parallelism_config(
    sagemaker_session, role, pipeline_name, region_name
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)

    callback_steps = [
        CallbackStep(
            name=f"callback-step{count}",
            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
            inputs={"arg1": "foo"},
            outputs=[outputParam],
        )
        for count in range(500)
    ]
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=callback_steps,
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role, parallelism_config={"MaxParallelExecutionSteps": 50})
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
        response = pipeline.describe()
        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 50

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role, parallelism_config={"MaxParallelExecutionSteps": 55})
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        response = pipeline.describe()
        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 55

    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_processing_step(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
Example #5
0
def test_lambda_step(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    outputParam2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )
    lambda_step.add_depends_on(["SecondTestStep"])
    assert lambda_step.to_request() == {
        "Name": "MyLambdaStep",
        "Type": "Lambda",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": param},
    }
Example #6
0
def test_join_expressions():
    assert Join(
        values=[
            "foo",
            ParameterFloat(name="MyFloat"),
            ParameterInteger(name="MyInt"),
            ParameterString(name="MyStr"),
            Properties(path="Steps.foo.OutputPath.S3Uri"),
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            Join(on=",", values=[1, "a", False, 1.1]),
        ]
    ).expr == {
        "Std:Join": {
            "On": "",
            "Values": [
                "foo",
                {"Get": "Parameters.MyFloat"},
                {"Get": "Parameters.MyInt"},
                {"Get": "Parameters.MyStr"},
                {"Get": "Steps.foo.OutputPath.S3Uri"},
                {"Get": "Execution.PipelineExecutionId"},
                {"Std:Join": {"On": ",", "Values": [1, "a", False, 1.1]}},
            ],
        },
    }
def test_callback_step_output_expr():
    param = ParameterInteger(name="MyInt")
    outputParam1 = CallbackOutput(output_name="output1",
                                  output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2",
                                  output_type=CallbackOutputTypeEnum.Boolean)
    cb_step = CallbackStep(
        name="MyCallbackStep",
        depends_on=["TestStep"],
        sqs_queue_url=
        "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={
            "arg1": "foo",
            "arg2": 5,
            "arg3": param
        },
        outputs=[outputParam1, outputParam2],
    )

    assert cb_step.properties.Outputs["output1"].expr == {
        "Get": "Steps.MyCallbackStep.OutputParameters['output1']"
    }
    assert cb_step.properties.Outputs["output2"].expr == {
        "Get": "Steps.MyCallbackStep.OutputParameters['output2']"
    }
Example #8
0
def test_fail_step_with_join_fn_in_error_message():
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[],
    )
    step_fail = FailStep(
        name="FailStep",
        error_message=Join(on=": ",
                           values=[
                               "Failed due to xxx == yyy returns",
                               step_cond.properties.Outcome
                           ]),
    )
    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step_cond, step_fail],
        parameters=[param],
    )

    _expected_dsl = [
        {
            "Name": "CondStep",
            "Type": "Condition",
            "Arguments": {
                "Conditions": [{
                    "Type": "Equals",
                    "LeftValue": {
                        "Get": "Parameters.MyInt"
                    },
                    "RightValue": 1
                }],
                "IfSteps": [],
                "ElseSteps": [],
            },
        },
        {
            "Name": "FailStep",
            "Type": "Fail",
            "Arguments": {
                "ErrorMessage": {
                    "Std:Join": {
                        "On":
                        ": ",
                        "Values": [
                            "Failed due to xxx == yyy returns",
                            {
                                "Get": "Steps.CondStep.Outcome"
                            },
                        ],
                    }
                }
            },
        },
    ]

    assert json.loads(pipeline.definition())["Steps"] == _expected_dsl
def test_invalid_pipeline_depended_on_fail_step(sagemaker_session, role,
                                                pipeline_name):
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_fail = FailStep(
        name="FailStep",
        error_message="Failed pipeline execution",
    )
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[],
        depends_on=["FailStep"],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[step_cond, step_fail],
        sagemaker_session=sagemaker_session,
        parameters=[param],
    )

    try:
        with pytest.raises(Exception) as error:
            pipeline.create(role)

        assert "CondStep can not depends on FailStep" in str(error.value)
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_two_step_fail_pipeline_with_str_err_msg(sagemaker_session, role,
                                                 pipeline_name):
    param = ParameterInteger(name="MyInt", default_value=2)
    cond = ConditionEquals(left=param, right=1)
    step_fail = FailStep(
        name="FailStep",
        error_message="Failed due to hitting in else branch",
    )
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[cond],
        if_steps=[],
        else_steps=[step_fail],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
        parameters=[param],
    )

    try:
        response = pipeline.create(role)
        pipeline_arn = response["PipelineArn"]
        execution = pipeline.start(parameters={})
        response = execution.describe()
        assert response["PipelineArn"] == pipeline_arn

        try:
            execution.wait(delay=30, max_attempts=60)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()

        assert len(execution_steps) == 2
        for execution_step in execution_steps:
            if execution_step["StepName"] == "CondStep":
                assert execution_step["StepStatus"] == "Succeeded"
                continue
            assert execution_step["StepName"] == "FailStep"
            assert execution_step["StepStatus"] == "Failed"
            assert execution_step[
                "FailureReason"] == "Failed due to hitting in else branch"
            metadata = execution_steps[0]["Metadata"]["Fail"]
            assert metadata[
                "ErrorMessage"] == "Failed due to hitting in else branch"

        # Check FailureReason field in ListPipelineExecutions
        executions = sagemaker_session.sagemaker_client.list_pipeline_executions(
            PipelineName=pipeline.name)["PipelineExecutionSummaries"]

        assert len(executions) == 1
        assert executions[0]["PipelineExecutionStatus"] == "Failed"
        assert ("Step failure: One or multiple steps failed"
                in executions[0]["PipelineExecutionFailureReason"])
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_parameter_integer_implicit_value():
    param = ParameterInteger("MyInteger", 1)

    with pytest.raises(TypeError) as error:
        int(param)

    assert str(error.value) == "Pipeline variables do not support __int__ operation."
def test_one_step_lambda_pipeline(sagemaker_session, role, pipeline_name,
                                  region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam1 = LambdaOutput(output_name="output1",
                                output_type=LambdaOutputTypeEnum.String)
    step_lambda = LambdaStep(
        name="lambda-step",
        lambda_func=Lambda(
            function_arn=
            ("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"
             ),
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo"},
        outputs=[outputParam1],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_lambda],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example #13
0
def test_add_func_of_join():
    func_join1 = Join(values=[1, "a"])
    param = ParameterInteger(name="MyInteger", default_value=3)

    with pytest.raises(TypeError) as error:
        func_join1 + param

    assert str(error.value) == "Pipeline variables do not support concatenation."
Example #14
0
def test_condition_equals():
    param = ParameterInteger(name="MyInt")
    cond = ConditionEquals(left=param, right=1)
    assert cond.to_request() == {
        "Type": "Equals",
        "LeftValue": {
            "Get": "Parameters.MyInt"
        },
        "RightValue": 1,
    }
def test_large_pipeline(sagemaker_session, role, pipeline_name, region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)

    callback_steps = [
        CallbackStep(
            name=f"callback-step{count}",
            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
            inputs={"arg1": "foo"},
            outputs=[outputParam],
        )
        for count in range(2000)
    ]
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=callback_steps,
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
        response = pipeline.describe()
        assert len(json.loads(pipeline.describe()["PipelineDefinition"])["Steps"]) == 2000

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example #16
0
def test_add_depends_on(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri",
        default_value=f"s3://{BUCKET}/processing_manifest")
    instance_type_parameter = ParameterString(name="InstanceType",
                                              default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount",
                                                default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")

    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3.add_depends_on([step_2.name])

    assert "DependsOn" not in step_1.to_request()
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == [
        "MyProcessingStep-1", "MyProcessingStep-2"
    ]
def test_two_steps_emr_pipeline(sagemaker_session, role, pipeline_name,
                                region_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)

    emr_step_config = EMRStepConfig(
        jar=
        "s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar",
        args=["dummy_emr_script_path"],
    )

    step_emr_1 = EMRStep(
        name="emr-step-1",
        cluster_id="j-1YONHTCP3YZKC",
        display_name="emr_step_1",
        description="MyEMRStepDescription",
        step_config=emr_step_config,
    )

    step_emr_2 = EMRStep(
        name="emr-step-2",
        cluster_id=step_emr_1.properties.ClusterId,
        display_name="emr_step_2",
        description="MyEMRStepDescription",
        step_config=emr_step_config,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_emr_1, step_emr_2],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_condition_step():
    param = ParameterInteger(name="MyInt")
    cond = ConditionEquals(left=param, right=1)
    step1 = CustomStep("MyStep1")
    step2 = CustomStep("MyStep2")
    cond_step = ConditionStep(
        name="MyConditionStep",
        depends_on=["TestStep"],
        conditions=[cond],
        if_steps=[step1],
        else_steps=[step2],
    )
    cond_step.add_depends_on(["SecondTestStep"])
    assert cond_step.to_request() == {
        "Name": "MyConditionStep",
        "Type": "Condition",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            "Conditions": [
                {
                    "Type": "Equals",
                    "LeftValue": {
                        "Get": "Parameters.MyInt"
                    },
                    "RightValue": 1,
                },
            ],
            "IfSteps": [
                {
                    "Name": "MyStep1",
                    "Type": "Training",
                    "Arguments": {},
                },
            ],
            "ElseSteps": [{
                "Name": "MyStep2",
                "Type": "Training",
                "Arguments": {},
            }],
        },
    }
    assert cond_step.properties.Outcome.expr == {
        "Get": "Steps.MyConditionStep.Outcome"
    }
def test_callback_step():
    param = ParameterInteger(name="MyInt")
    outputParam1 = CallbackOutput(output_name="output1",
                                  output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2",
                                  output_type=CallbackOutputTypeEnum.Boolean)
    cb_step = CallbackStep(
        name="MyCallbackStep",
        depends_on=["TestStep"],
        sqs_queue_url=
        "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={
            "arg1": "foo",
            "arg2": 5,
            "arg3": param
        },
        outputs=[outputParam1, outputParam2],
    )
    cb_step.add_depends_on(["SecondTestStep"])
    assert cb_step.to_request() == {
        "Name":
        "MyCallbackStep",
        "Type":
        "Callback",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "SqsQueueUrl":
        "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        "OutputParameters": [
            {
                "OutputName": "output1",
                "OutputType": "String"
            },
            {
                "OutputName": "output2",
                "OutputType": "Boolean"
            },
        ],
        "Arguments": {
            "arg1": "foo",
            "arg2": 5,
            "arg3": param
        },
    }
def test_add_func():
    param_str = ParameterString(name="MyString", default_value="s3://foo/bar/baz.csv")
    param_int = ParameterInteger(name="MyInteger", default_value=3)
    param_float = ParameterFloat(name="MyFloat", default_value=1.5)
    param_bool = ParameterBoolean(name="MyBool")

    with pytest.raises(TypeError) as error:
        param_str + param_int
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_int + param_float
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_float + param_bool
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_bool + param_str
    assert str(error.value) == "Pipeline variables do not support concatenation."
def test_lambda_step_output_expr(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    outputParam2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[outputParam1, outputParam2],
    )

    assert lambda_step.properties.Outputs["output1"].expr == {
        "Get": "Steps.MyLambdaStep.OutputParameters['output1']"
    }
    assert lambda_step.properties.Outputs["output2"].expr == {
        "Get": "Steps.MyLambdaStep.OutputParameters['output2']"
    }
def test_lambda_step(sagemaker_session):
    param = ParameterInteger(name="MyInt")
    output_param1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    output_param2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean)
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        display_name="MyLambdaStep",
        description="MyLambdaStepDescription",
        inputs={"arg1": "foo", "arg2": 5, "arg3": param},
        outputs=[output_param1, output_param2],
        cache_config=cache_config,
    )
    lambda_step.add_depends_on(["SecondTestStep"])
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[param],
        steps=[lambda_step],
        sagemaker_session=sagemaker_session,
    )
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyLambdaStep",
        "Type": "Lambda",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "DisplayName": "MyLambdaStep",
        "Description": "MyLambdaStepDescription",
        "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": {"Get": "Parameters.MyInt"}},
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
def test_parameter_integer_implicit_value():
    param = ParameterInteger("MyInteger")
    assert param.__int__() == 0
    param1 = ParameterInteger("MyInteger", 1)
    assert param1.__int__() == 1
    param2 = ParameterInteger("MyInteger", default_value=2)
    assert param2.__int__() == 2
    param3 = ParameterInteger(name="MyInteger", default_value=3)
    assert param3.__int__() == 3
def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role,
                                                    pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")

    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]

    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]

    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )

    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name)
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")
    assert definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"][
        "LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_two_processing_job_depends_on(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
    cpu_instance_type,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark_1 = ProcessingStep(
        name="pyspark-process-1",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    step_pyspark_2 = ProcessingStep(
        name="pyspark-process-2",
        depends_on=[step_pyspark_1],
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark_1, step_pyspark_2],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 2
        time_stamp = {}
        for execution_step in execution_steps:
            name = execution_step["StepName"]
            if name == "pyspark-process-1":
                time_stamp[name] = execution_step["EndTime"]
            else:
                time_stamp[name] = execution_step["StartTime"]
        assert time_stamp["pyspark-process-1"] < time_stamp["pyspark-process-2"]
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java",
                                    "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when role used in the pipeline execution is
        # different than that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required to during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_one_step_pyspark_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark = ProcessingStep(
        name="pyspark-process",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when role used in the pipeline execution is
        # different than that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required to during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "pyspark-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_one_step_framework_processing_pipeline(
    sagemaker_session,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    region_name,
    athena_dataset_definition,
):
    """Use `SKLearnProcessor` to test `FrameworkProcessor`."""
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    inputs = [
        ProcessingInput(source=input_file_path,
                        destination="/opt/ml/processing/inputs/"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]

    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs)

    step_sklearn = ProcessingStep(
        name="sklearn-process",
        processor=sklearn_processor,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        job_arguments=run_args.arguments,
        code=run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_sklearn],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when role used in the pipeline execution is
        # different than that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required to during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def get_pipeline(
        region,
        role=None,
        default_bucket=None,
        model_package_group_name="CustomerChurnPackageGroup",  # Choose any name
        pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
        base_job_prefix="CustomerChurn",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value=
        "PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sm-pipelines-demo-data-123456789/churn.txt",  # Change this to point to the s3 location of your raw input data.
    )

    # Processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=
        f"{base_job_prefix}/sklearn-CustomerChurn-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="CustomerChurnProcess",  # choose any name
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework=
        "xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/CustomerChurn-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="binary:logistic",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="CustomerChurnTrain",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="CustomerChurnEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # Register model step that will be conditionally executed
    model_metrics = ModelMetrics(model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
        content_type="application/json",
    ))

    # Register model step that will be conditionally executed
    step_register = RegisterModel(
        name="CustomerChurnRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_lte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path=
            "binary_classification_metrics.accuracy.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=0.8,  # You can change the threshold here
    )
    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
                                       content_type="application/json"))
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline