Code Example #1
def test_pipeline_start(sagemaker_session_mock):
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.return_value = {
        "PipelineExecutionArn": "my:arn"
    }
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[
            ParameterString("alpha", "beta"),
            ParameterString("gamma", "delta")
        ],
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )
    pipeline.start()
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline"
    )

    pipeline.start(execution_display_name="pipeline-execution")
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline",
        PipelineExecutionDisplayName="pipeline-execution",
    )

    pipeline.start(parameters=dict(alpha="epsilon"))
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.assert_called_with(
        PipelineName="MyPipeline",
        PipelineParameters=[{"Name": "alpha", "Value": "epsilon"}],
    )
Code Example #2
def test_pipeline_execution_basics(sagemaker_session_mock):
    sagemaker_session_mock.sagemaker_client.start_pipeline_execution.return_value = {
        "PipelineExecutionArn": "my:arn"
    }
    sagemaker_session_mock.sagemaker_client.list_pipeline_execution_steps.return_value = {
        "PipelineExecutionSteps": [Mock()]
    }
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[ParameterString("alpha", "beta"), ParameterString("gamma", "delta")],
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )
    execution = pipeline.start()
    execution.stop()
    sagemaker_session_mock.sagemaker_client.stop_pipeline_execution.assert_called_with(
        PipelineExecutionArn="my:arn"
    )
    execution.describe()
    sagemaker_session_mock.sagemaker_client.describe_pipeline_execution.assert_called_with(
        PipelineExecutionArn="my:arn"
    )
    steps = execution.list_steps()
    sagemaker_session_mock.sagemaker_client.list_pipeline_execution_steps.assert_called_with(
        PipelineExecutionArn="my:arn"
    )
    assert len(steps) == 1
Code Example #3
def test_processing_step(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
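Note that step.to_request() keeps the parameter objects themselves in place (the S3Uri, InstanceCount, and InstanceType values above); they only collapse to {"Get": "Parameters.<Name>"} when a pipeline definition is serialized, as Code Example #13 shows. A parameter's serialized reference is exposed via .expr; a minimal sketch:

from sagemaker.workflow.parameters import ParameterString

param = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
# .expr is the JSON fragment the pipeline definition will contain
assert param.expr == {"Get": "Parameters.InstanceType"}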
Code Example #4
def test_parameter_to_string_and_string_implicit_value():
    param = ParameterString("MyString", "1")

    assert param.to_string() == param

    with pytest.raises(TypeError) as error:
        str(param)

    assert str(error.value) == "Pipeline variables do not support __str__ operation."
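Because pipeline variables reject __str__, string composition with a ParameterString has to be deferred to execution time with Join (see Code Example #9). A minimal sketch, using a hypothetical "Bucket" parameter:

from sagemaker.workflow.functions import Join
from sagemaker.workflow.parameters import ParameterString

bucket = ParameterString(name="Bucket", default_value="my-bucket")
# Evaluates to "s3://<bucket>/prefix/data.csv" when the pipeline runs,
# instead of eagerly calling str() on the parameter.
uri = Join(on="/", values=["s3:/", bucket, "prefix", "data.csv"])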
Code Example #5
def test_pipeline_start_before_creation(sagemaker_session_mock):
    sagemaker_session_mock.sagemaker_client.describe_pipeline.side_effect = ClientError({}, "bar")
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[ParameterString("alpha", "beta"), ParameterString("gamma", "delta")],
        steps=[],
        sagemaker_session=sagemaker_session_mock,
    )
    with pytest.raises(ValueError):
        pipeline.start()
Code Example #6
def test_add_depends_on(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri",
        default_value=f"s3://{BUCKET}/processing_manifest")
    instance_type_parameter = ParameterString(name="InstanceType",
                                              default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount",
                                                default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")

    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3.add_depends_on([step_2.name])

    assert "DependsOn" not in step_1.to_request()
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == [
        "MyProcessingStep-1", "MyProcessingStep-2"
    ]
Code Example #7
def get_pipeline(region, role, image_uri, model_path):

    session = get_session(region)
    if role is None:
        role = sagemaker.session.get_execution_role(session)

    train_data_param = ParameterString(name='train-data')
    validation_data_param = ParameterString(name='validation-data')
    image_uri_param = ParameterString(name='image-uri')
    model_path_param = ParameterString(name='model-path')
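    # NOTE: image_uri_param and model_path_param are declared but never used;
    # the estimator below consumes the plain image_uri and model_path arguments.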

    estimator = Estimator(image_uri=image_uri,
                          instance_type='ml.m5.xlarge',
                          instance_count=1,
                          output_path=model_path,
                          sagemaker_session=session,
                          role=role)

    ### Your Pipeline definition goes here ....
    ###########################################

    step_train = TrainingStep(name="iris-model-train",
                              estimator=estimator,
                              inputs={
                                  "train":
                                  TrainingInput(s3_data=train_data_param,
                                                content_type='text/csv'),
                                  "validation":
                                  TrainingInput(s3_data=validation_data_param,
                                                content_type='text/csv')
                              })

    step_register = RegisterModel(
        name='iris-model-register',
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        estimator=estimator,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name='iris-model')

    pipeline = Pipeline(name='iris-model-pipeline',
                        steps=[step_train, step_register],
                        parameters=[train_data_param, validation_data_param],
                        sagemaker_session=session)

    ### end of Pipeline definition
    ###########################################

    return pipeline
Code Example #8
def test_condition_or():
    var = ExecutionVariables.START_DATETIME
    cond = ConditionGreaterThan(left=var, right="2020-12-01")
    param = ParameterString(name="MyStr")
    cond_in = ConditionIn(value=param, in_values=["abc", "def"])
    cond_or = ConditionOr(conditions=[cond, cond_in])
    assert cond_or.to_request() == {
        "Type": "Or",
        "Conditions": [
            {
                "Type": "GreaterThan",
                "LeftValue": {
                    "Get": "Execution.StartDateTime"
                },
                "RightValue": "2020-12-01",
            },
            {
                "Type": "In",
                "QueryValue": {
                    "Get": "Parameters.MyStr"
                },
                "Values": ["abc", "def"],
            },
        ],
    }
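The conditions module offers ConditionOr and ConditionNot but no ConditionAnd; a logical AND is expressed by listing several conditions on one ConditionStep, whose if_steps run only when every condition holds. A sketch reusing cond and cond_in from above (steps elided):

from sagemaker.workflow.condition_step import ConditionStep

# Both conditions must evaluate to true for if_steps to execute.
step_cond = ConditionStep(
    name="MyConditionStep",
    conditions=[cond, cond_in],
    if_steps=[],
    else_steps=[],
)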
Code Example #9
def test_join_expressions():
    assert Join(
        values=[
            "foo",
            ParameterFloat(name="MyFloat"),
            ParameterInteger(name="MyInt"),
            ParameterString(name="MyStr"),
            Properties(path="Steps.foo.OutputPath.S3Uri"),
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            Join(on=",", values=[1, "a", False, 1.1]),
        ]
    ).expr == {
        "Std:Join": {
            "On": "",
            "Values": [
                "foo",
                {"Get": "Parameters.MyFloat"},
                {"Get": "Parameters.MyInt"},
                {"Get": "Parameters.MyStr"},
                {"Get": "Steps.foo.OutputPath.S3Uri"},
                {"Get": "Execution.PipelineExecutionId"},
                {"Std:Join": {"On": ",", "Values": [1, "a", False, 1.1]}},
            ],
        },
    }
Code Example #10
def test_pipeline_interpolates_lambda_outputs(sagemaker_session):
    parameter = ParameterString("MyStr")
    output_param1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String)
    output_param2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.String)
    lambda_step1 = LambdaStep(
        name="MyLambdaStep1",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": "foo"},
        outputs=[output_param1],
    )
    lambda_step2 = LambdaStep(
        name="MyLambdaStep2",
        depends_on=["TestStep"],
        lambda_func=Lambda(
            function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
            session=sagemaker_session,
        ),
        inputs={"arg1": output_param1},
        outputs=[output_param2],
    )

    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[lambda_step1, lambda_step2],
        sagemaker_session=sagemaker_session,
    )

    assert json.loads(pipeline.definition()) == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [
            {
                "Name": "MyLambdaStep1",
                "Type": "Lambda",
                "Arguments": {"arg1": "foo"},
                "DependsOn": ["TestStep"],
                "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
                "OutputParameters": [{"OutputName": "output1", "OutputType": "String"}],
            },
            {
                "Name": "MyLambdaStep2",
                "Type": "Lambda",
                "Arguments": {"arg1": {"Get": "Steps.MyLambdaStep1.OutputParameters['output1']"}},
                "DependsOn": ["TestStep"],
                "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
                "OutputParameters": [{"OutputName": "output2", "OutputType": "String"}],
            },
        ],
    }
Code Example #11
def test_condition_in():
    param = ParameterString(name="MyStr")
    cond_in = ConditionIn(value=param, in_values=["abc", "def"])
    assert cond_in.to_request() == {
        "Type": "In",
        "QueryValue": {
            "Get": "Parameters.MyStr"
        },
        "Values": ["abc", "def"],
    }
Code Example #12
File: pipeline.py (Project: ryankarlos/AWS-ML)
def get_pipeline(
    region,
    default_bucket='sagemaker-us-east-1-376337229415',
    model_package_group_name="AbaloneExample", 
    pipeline_name="AbaloneExample",  
    base_job_prefix="abalone", 
):
    """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data.
    Args:
        region: AWS region to create and run the pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    role = sagemaker.session.get_execution_role(sagemaker_session)

    # Pipeline execution parameters. The instance-type/count and approval-status
    # parameters are referenced below but were not defined in the original
    # snippet; these definitions and their defaults are assumed.
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1
    )
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_count = ParameterInteger(
        name="TrainingInstanceCount", default_value=1
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value="s3://{}/abalone/abalone-dataset.csv".format(default_bucket),
    )

    step_process = processing_job(
        processing_instance_type, processing_instance_count, sagemaker_session, role
    )
    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/Abalonetrain"
    
    image_uri = get_image_uri(framework="xgboost", 
                              region=region, 
                              version="1.0-1", 
                              py_version="py3", 
                              training_instance_type=training_instance_type)

    
    step_train = training_job(image_uri, 
                              training_instance_type, 
                              training_instance_count, 
                              model_path, 
                              sagemaker_session, role)
    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            training_instance_count,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
Code Example #13
def test_pipeline_basic(sagemaker_session_mock):
    parameter = ParameterString("MyStr")
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[CustomStep(name="MyStep", input_data=parameter)],
        sagemaker_session=sagemaker_session_mock,
    )
    assert pipeline.to_request() == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": ExecutionVariables.PIPELINE_NAME,
            "TrialName": ExecutionVariables.PIPELINE_EXECUTION_ID,
        },
        "Steps": [{
            "Name": "MyStep",
            "Type": "Training",
            "Arguments": {"input_data": parameter},
        }],
    }
    assert ordered(json.loads(pipeline.definition())) == ordered({
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [{
            "Name": "MyStep",
            "Type": "Training",
            "Arguments": {"input_data": {"Get": "Parameters.MyStr"}},
        }],
    })
Code Example #14
def test_pipeline_interpolates_callback_outputs(sagemaker_session_mock):
    parameter = ParameterString("MyStr")
    outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String)
    outputParam2 = CallbackOutput(output_name="output2", output_type=CallbackOutputTypeEnum.String)
    cb_step1 = CallbackStep(
        name="MyCallbackStep1",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={"arg1": "foo"},
        outputs=[outputParam1],
    )
    cb_step2 = CallbackStep(
        name="MyCallbackStep2",
        depends_on=["TestStep"],
        sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        inputs={"arg1": outputParam1},
        outputs=[outputParam2],
    )

    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[cb_step1, cb_step2],
        sagemaker_session=sagemaker_session_mock,
    )

    assert json.loads(pipeline.definition()) == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [
            {
                "Name": "MyCallbackStep1",
                "Type": "Callback",
                "Arguments": {"arg1": "foo"},
                "DependsOn": ["TestStep"],
                "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
                "OutputParameters": [{"OutputName": "output1", "OutputType": "String"}],
            },
            {
                "Name": "MyCallbackStep2",
                "Type": "Callback",
                "Arguments": {"arg1": {"Get": "Steps.MyCallbackStep1.OutputParameters['output1']"}},
                "DependsOn": ["TestStep"],
                "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
                "OutputParameters": [{"OutputName": "output2", "OutputType": "String"}],
            },
        ],
    }
Code Example #15
def test_condition_less_than_or_equal_to():
    var = ExecutionVariables.START_DATETIME
    param = ParameterString(name="StartDateTime")
    cond = ConditionLessThanOrEqualTo(left=var, right=param)
    assert cond.to_request() == {
        "Type": "LessThanOrEqualTo",
        "LeftValue": {
            "Get": "Execution.StartDateTime"
        },
        "RightValue": {
            "Get": "Parameters.StartDateTime"
        },
    }
Code Example #16
def test_condition_not_in():
    param = ParameterString(name="MyStr")
    cond_in = ConditionIn(value=param, in_values=["abc", "def"])
    cond_not = ConditionNot(expression=cond_in)
    assert cond_not.to_request() == {
        "Type": "Not",
        "Expression": {
            "Type": "In",
            "QueryValue": {
                "Get": "Parameters.MyStr"
            },
            "Values": ["abc", "def"],
        },
    }
Code Example #17
def test_condition_not():
    param = ParameterString(name="MyStr")
    cond_eq = ConditionEquals(left=param, right="foo")
    cond_not = ConditionNot(expression=cond_eq)
    assert cond_not.to_request() == {
        "Type": "Not",
        "Expression": {
            "Type": "Equals",
            "LeftValue": {
                "Get": "Parameters.MyStr"
            },
            "RightValue": "foo",
        },
    }
Code Example #18
def test_transform_step_with_transformer(pipeline_session):
    model_name = ParameterString("ModelName")
    transformer = Transformer(
        model_name=model_name,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{pipeline_session.default_bucket()}/Transform",
        sagemaker_session=pipeline_session,
    )

    transform_inputs = TransformInput(
        data=f"s3://{pipeline_session.default_bucket()}/batch-data",
    )

    with warnings.catch_warnings(record=True) as w:
        step_args = transformer.transform(
            data=transform_inputs.data,
            data_type=transform_inputs.data_type,
            content_type=transform_inputs.content_type,
            compression_type=transform_inputs.compression_type,
            split_type=transform_inputs.split_type,
            input_filter=transform_inputs.input_filter,
            output_filter=transform_inputs.output_filter,
            join_source=transform_inputs.join_source,
            model_client_config=transform_inputs.model_client_config,
        )
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    with warnings.catch_warnings(record=True) as w:
        step = TransformStep(
            name="MyTransformStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        parameters=[model_name],
        sagemaker_session=pipeline_session,
    )
    step_args["ModelName"] = model_name.expr
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Arguments": step_args,
    }
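The UserWarning asserted above is the point of this example: under a PipelineSession, transformer.transform(...) does not launch a batch transform job; it captures the call as step arguments, which TransformStep then consumes via step_args.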
Code Example #19
def test_condition_in_mixed():
    param = ParameterString(name="MyStr")
    prop = Properties("foo")
    var = ExecutionVariables.START_DATETIME
    cond_in = ConditionIn(value=param, in_values=["abc", prop, var])
    assert cond_in.to_request() == {
        "Type": "In",
        "QueryValue": {
            "Get": "Parameters.MyStr"
        },
        "Values": ["abc", {
            "Get": "foo"
        }, {
            "Get": "Execution.StartDateTime"
        }],
    }
Code Example #20
def test_json_get_expressions_with_invalid_step_name():
    with pytest.raises(ValueError) as err:
        JsonGet(
            step_name="",
            property_file="my-property-file",
            json_path="my-json-path",
        ).expr

    assert "Please give a valid step name as a string" in str(err.value)

    with pytest.raises(ValueError) as err:
        JsonGet(
            step_name=ParameterString(name="MyString"),
            property_file="my-property-file",
            json_path="my-json-path",
        ).expr

    assert "Please give a valid step name as a string" in str(err.value)
Code Example #21
def test_pipeline_basic(sagemaker_session_mock):
    parameter = ParameterString("MyStr")
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[CustomStep(name="MyStep", input_data=parameter)],
        sagemaker_session=sagemaker_session_mock,
    )
    assert pipeline.to_request() == {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "Steps": [{
            "Name": "MyStep",
            "Type": "Training",
            "Arguments": {"input_data": parameter},
        }],
    }
    assert ordered(json.loads(pipeline.definition())) == ordered({
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "Steps": [{
            "Name": "MyStep",
            "Type": "Training",
            "Arguments": {"input_data": {"Get": "Parameters.MyStr"}},
        }],
    })
Code Example #22
def test_data_quality_check_step(
    sagemaker_session,
    check_job_config,
    model_package_group_name,
    supplied_baseline_statistics_uri,
    supplied_baseline_constraints_uri,
):
    data_quality_check_config = DataQualityCheckConfig(
        baseline_dataset=ParameterString(name="BaselineDataset"),
        dataset_format=DatasetFormat.csv(header=True),
        output_s3_uri="s3://...",
        record_preprocessor_script="s3://my_bucket/data_quality/preprocessor.py",
        post_analytics_processor_script="s3://my_bucket/data_quality/postprocessor.py",
    )
    data_quality_check_step = QualityCheckStep(
        name="DataQualityCheckStep",
        skip_check=False,
        register_new_baseline=False,
        quality_check_config=data_quality_check_config,
        check_job_config=check_job_config,
        model_package_group_name=model_package_group_name,
        supplied_baseline_statistics=supplied_baseline_statistics_uri,
        supplied_baseline_constraints=supplied_baseline_constraints_uri,
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[
            supplied_baseline_statistics_uri,
            supplied_baseline_constraints_uri,
            model_package_group_name,
        ],
        steps=[data_quality_check_step],
        sagemaker_session=sagemaker_session,
    )
    step_definition = _get_step_definition_for_test(
        pipeline, ["baseline_dataset_input", "quality_check_output"])

    assert step_definition == _expected_data_quality_dsl
Code Example #23
def test_parameter_string_with_enum_values():
    param = ParameterString("MyString", enum_values=["a", "b"])
    assert param.to_request() == {"Name": "MyString", "Type": "String", "EnumValues": ["a", "b"]}
    assert param.expr == {"Get": "Parameters.MyString"}
    assert param.parameter_type.python_type == str

    param = ParameterString("MyString", default_value="a", enum_values=["a", "b"])
    assert param.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "DefaultValue": "a",
        "EnumValues": ["a", "b"],
    }
    assert param.expr == {"Get": "Parameters.MyString"}
    assert param.parameter_type.python_type == str
Code Example #24
def test_add_func():
    param_str = ParameterString(name="MyString", default_value="s3://foo/bar/baz.csv")
    param_int = ParameterInteger(name="MyInteger", default_value=3)
    param_float = ParameterFloat(name="MyFloat", default_value=1.5)
    param_bool = ParameterBoolean(name="MyBool")

    with pytest.raises(TypeError) as error:
        param_str + param_int
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_int + param_float
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_float + param_bool
    assert str(error.value) == "Pipeline variables do not support concatenation."

    with pytest.raises(TypeError) as error:
        param_bool + param_str
    assert str(error.value) == "Pipeline variables do not support concatenation."
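As with __str__ in Code Example #4, the supported way to build a string out of pipeline variables is Join (Code Example #9), which defers the concatenation until the pipeline executes.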
Code Example #25
def test_parameter_string_with_enum_values():
    param = ParameterString("MyString", enum_values=["a", "b"])
    assert param.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "EnumValues": ["a", "b"]
    }
    param = ParameterString("MyString",
                            default_value="a",
                            enum_values=["a", "b"])
    assert param.to_request() == {
        "Name": "MyString",
        "Type": "String",
        "DefaultValue": "a",
        "EnumValues": ["a", "b"],
    }
Code Example #26
def test_large_pipeline_update(sagemaker_session_mock, role_arn):
    parameter = ParameterString("MyStr")
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[parameter],
        steps=[CustomStep(name="MyStep", input_data=parameter)] * 2000,
        sagemaker_session=sagemaker_session_mock,
    )

    s3.S3Uploader.upload_string_as_file_body = Mock()

    pipeline.update(role_arn=role_arn)

    s3.S3Uploader.upload_string_as_file_body.assert_called_with(
        body=pipeline.definition(), s3_uri="s3://s3_bucket/MyPipeline"
    )

    sagemaker_session_mock.sagemaker_client.update_pipeline.assert_called_with(
        PipelineName="MyPipeline",
        PipelineDefinitionS3Location={"Bucket": "s3_bucket", "ObjectKey": "MyPipeline"},
        RoleArn=role_arn,
    )
Code Example #27
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
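A usage sketch for this seedcode-style get_pipeline; the driver below is not part of the original project, and the region and role lookup are assumptions:

import sagemaker

role = sagemaker.get_execution_role()  # assumes a SageMaker execution context
pipeline = get_pipeline(region="us-east-1", role=role)
pipeline.upsert(role_arn=role)  # create the pipeline, or update it in place
execution = pipeline.start()    # run with the parameters' default values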
Code Example #28
def test_parameter_string_implicit_value():
    param = ParameterString("MyString")
    assert param.__str__() == ""
    param1 = ParameterString("MyString", "1")
    assert param1.__str__() == "1"
    param2 = ParameterString("MyString", default_value="2")
    assert param2.__str__() == "2"
    param3 = ParameterString(name="MyString", default_value="3")
    assert param3.__str__() == "3"
    param3 = ParameterString(name="MyString",
                             default_value="3",
                             enum_values=["3"])
    assert param3.__str__() == "3"
Code Example #29
def test_parsable_parameter_string():
    param = ParameterString("MyString", default_value="s3://foo/bar/baz.csv")
    assert urlparse(param).scheme == "s3"
Code Example #30
def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role,
                                                    pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")

    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]

    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]

    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )

    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name)
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")
    assert definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"][
        "LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass