def test_pipeline_interpolates_callback_outputs():
    """Check that a callback output reference is rendered as a ``Get`` expression.

    Two callback steps are built; the second one takes ``output1`` of the
    first as its ``arg1`` input. The JSON pipeline definition is then compared
    with the complete expected document.
    """
    # NOTE(review): ``sagemaker_session_mock`` is not a parameter of this test,
    # so it has to resolve to a module-level name - confirm this is intended.
    queue_url = "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue"

    str_param = ParameterString("MyStr")
    first_output = CallbackOutput(
        output_name="output1",
        output_type=CallbackOutputTypeEnum.String,
    )
    second_output = CallbackOutput(
        output_name="output2",
        output_type=CallbackOutputTypeEnum.String,
    )

    first_step = CallbackStep(
        name="MyCallbackStep1",
        depends_on=["TestStep"],
        sqs_queue_url=queue_url,
        inputs={"arg1": "foo"},
        outputs=[first_output],
    )
    # The input of the second step is a property reference into the outputs
    # of the first step; this is the value expected to be interpolated.
    second_step = CallbackStep(
        name="MyCallbackStep2",
        depends_on=["TestStep"],
        sqs_queue_url=queue_url,
        inputs={"arg1": first_step.properties.Outputs["output1"]},
        outputs=[second_output],
    )

    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[str_param],
        steps=[first_step, second_step],
        sagemaker_session=sagemaker_session_mock,
    )

    rendered = json.loads(pipeline.definition())

    expected_first_step = {
        "Name": "MyCallbackStep1",
        "Type": "Callback",
        "Arguments": {"arg1": "foo"},
        "DependsOn": ["TestStep"],
        "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        "OutputParameters": [{"OutputName": "output1", "OutputType": "String"}],
    }
    expected_second_step = {
        "Name": "MyCallbackStep2",
        "Type": "Callback",
        "Arguments": {
            "arg1": {"Get": "Steps.MyCallbackStep1.OutputParameters['output1']"},
        },
        "DependsOn": ["TestStep"],
        "SqsQueueUrl": "https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
        "OutputParameters": [{"OutputName": "output2", "OutputType": "String"}],
    }
    expected = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": {"Get": "Execution.PipelineName"},
            "TrialName": {"Get": "Execution.PipelineExecutionId"},
        },
        "Steps": [expected_first_step, expected_second_step],
    }

    assert rendered == expected
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    """Run a single-step PyTorch training pipeline and check debugger/profiler config.

    The estimator is configured with three debugger rules and a debugger hook.
    After the pipeline execution finishes, the training job description is
    compared against those rules and the hook, and the profiler is expected to
    be enabled with a 500 ms interval. Each attempt runs inside the ``retries``
    loop; the pipeline is deleted (best effort) at the end of every attempt.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    debug_rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    hook_config = DebuggerHookConfig(
        s3_output_path=(f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors")
    )

    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")
    training_data_uri = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    training_input = TrainingInput(s3_data=training_data_uri)

    estimator = PyTorch(
        entry_point=entry_script,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=debug_rules,
        debugger_hook_config=hook_config,
    )

    train_step = TrainingStep(
        name="pytorch-train",
        estimator=estimator,
        inputs=training_input,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[train_step],
        sagemaker_session=sagemaker_session,
    )

    for _ in retries(
        max_retry_count=5,
        exception_message_prefix="Waiting for a successful execution of pipeline",
        seconds_to_sleep=10,
    ):
        try:
            create_arn = pipeline.create(role)["PipelineArn"]

            execution = pipeline.start()
            described = execution.describe()
            assert described["PipelineArn"] == create_arn

            # A waiter timeout/failure is tolerated here; the step list
            # inspected below decides whether the attempt passed.
            try:
                execution.wait(delay=10, max_attempts=60)
            except WaiterError:
                pass
            listed_steps = execution.list_steps()

            assert len(listed_steps) == 1
            train_result = listed_steps[0]
            failure_reason = train_result.get("FailureReason", "")
            if failure_reason != "":
                # Go to the next retry iteration; the ``finally`` clause still
                # deletes the pipeline first.
                logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..")
                continue
            assert train_result["StepName"] == "pytorch-train"
            assert train_result["StepStatus"] == "Succeeded"

            # The job name is taken as the part after the first "/" of the ARN.
            training_job_arn = train_result["Metadata"]["TrainingJob"]["Arn"]
            training_job_name = training_job_arn.split("/")[1]
            job_description = sagemaker_session.sagemaker_client.describe_training_job(
                TrainingJobName=training_job_name
            )

            # Rule configurations are compared position by position with the
            # rules given to the estimator.
            for position, rule in enumerate(debug_rules):
                reported = job_description["DebugRuleConfigurations"][position]
                assert reported["RuleConfigurationName"] == rule.name
                assert reported["RuleEvaluatorImage"] == rule.image_uri
                assert reported["VolumeSizeInGB"] == 0
                expected_rule_to_invoke = rule.rule_parameters["rule_to_invoke"]
                assert reported["RuleParameters"]["rule_to_invoke"] == expected_rule_to_invoke
            assert job_description["DebugHookConfig"] == hook_config._to_request_dict()

            assert job_description["ProfilingStatus"] == "Enabled"
            assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
            break
        finally:
            # Best-effort cleanup: a failed delete must not mask the test result.
            try:
                pipeline.delete()
            except Exception:
                pass
def test_training_step_with_output_path_as_join(
    sagemaker_session, role, tf_full_version, tf_full_py_version, pipeline_name, region_name
):
    """Create and run a training pipeline whose estimator ``output_path`` is a ``Join``.

    The test checks the shape of the pipeline and execution ARNs and that the
    execution lists exactly one step named ``MyTrain``. It does not assert the
    final status of that step. The pipeline is deleted (best effort) afterwards.
    """
    data_dir = os.path.join(DATA_DIR, "dummy_tensor")
    training_data_uri = sagemaker_session.upload_data(
        path=data_dir, key_prefix="integ-test-data/estimator/training"
    )
    training_input = TrainingInput(s3_data=training_data_uri)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    # The output location is given as a pipeline ``Join`` function rather than
    # a plain string - this is the feature under test.
    joined_output_path = Join(
        on="/", values=["s3:/", f"{sagemaker_session.default_bucket()}", f"{pipeline_name}Train"]
    )

    fm_image_uri = image_uris.retrieve("factorization-machines", sagemaker_session.boto_region_name)
    fm_estimator = Estimator(
        image_uri=fm_image_uri,
        role=role,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        output_path=joined_output_path,
    )
    fm_estimator.set_hyperparameters(
        num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="binary_classifier"
    )
    train_step = TrainingStep(
        name="MyTrain",
        estimator=fm_estimator,
        inputs=training_input,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[train_step],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        create_arn = pipeline.create(role)["PipelineArn"]
        assert re.match(pipeline_arn_pattern, create_arn)

        execution = pipeline.start(parameters={})
        assert re.match(execution_arn_pattern, execution.arn)

        # A waiter timeout/failure is tolerated; only the step list is checked.
        try:
            execution.wait(delay=30, max_attempts=60)
        except WaiterError:
            pass
        listed_steps = execution.list_steps()

        assert len(listed_steps) == 1
        assert listed_steps[0]["StepName"] == "MyTrain"
    finally:
        # Best-effort cleanup: a failed delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
# Example #4
# 0
def test_pipeline_two_step(sagemaker_session_mock):
    """Verify the request and rendered definition of a two-step pipeline.

    The first step receives a parameter, an execution variable and an
    experiment-config property as inputs; the second step receives a property
    of the first step. ``to_request()`` is expected to keep those objects as
    they are, while ``definition()`` is expected to render each of them as a
    ``{"Get": ...}`` expression.
    """
    str_param = ParameterString("MyStr")
    step1 = CustomStep(
        name="MyStep1",
        input_data=[
            str_param,  # parameter reference
            ExecutionVariables.PIPELINE_EXECUTION_ID,  # execution variable
            PipelineExperimentConfigProperties.EXPERIMENT_NAME,  # experiment config property
        ],
    )
    step2 = CustomStep(
        name="MyStep2",
        input_data=[step1.properties.S3Uri],  # step property
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[str_param],
        steps=[step1, step2],
        sagemaker_session=sagemaker_session_mock,
    )

    # Un-rendered request: pipeline variables appear as the original objects.
    actual_request = pipeline.to_request()
    expected_request = {
        "Version": "2020-12-01",
        "Metadata": {},
        "Parameters": [{"Name": "MyStr", "Type": "String"}],
        "PipelineExperimentConfig": {
            "ExperimentName": ExecutionVariables.PIPELINE_NAME,
            "TrialName": ExecutionVariables.PIPELINE_EXECUTION_ID,
        },
        "Steps": [
            {
                "Name": "MyStep1",
                "Type": "Training",
                "Arguments": {
                    "input_data": [
                        str_param,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                        PipelineExperimentConfigProperties.EXPERIMENT_NAME,
                    ]
                },
            },
            {
                "Name": "MyStep2",
                "Type": "Training",
                "Arguments": {"input_data": [step1.properties.S3Uri]},
            },
        ],
    }
    assert actual_request == expected_request

    # Rendered definition: every pipeline variable becomes a "Get" expression.
    actual_definition = ordered(json.loads(pipeline.definition()))
    expected_definition = ordered(
        {
            "Version": "2020-12-01",
            "Metadata": {},
            "Parameters": [{"Name": "MyStr", "Type": "String"}],
            "PipelineExperimentConfig": {
                "ExperimentName": {"Get": "Execution.PipelineName"},
                "TrialName": {"Get": "Execution.PipelineExecutionId"},
            },
            "Steps": [
                {
                    "Name": "MyStep1",
                    "Type": "Training",
                    "Arguments": {
                        "input_data": [
                            {"Get": "Parameters.MyStr"},
                            {"Get": "Execution.PipelineExecutionId"},
                            {"Get": "PipelineExperimentConfig.ExperimentName"},
                        ]
                    },
                },
                {
                    "Name": "MyStep2",
                    "Type": "Training",
                    "Arguments": {"input_data": [{"Get": "Steps.MyStep1.S3Uri"}]},
                },
            ],
        }
    )
    assert actual_definition == expected_definition
def test_ppl_var_to_string_and_add(sagemaker_session, role, pipeline_name):
    """Exercise ``to_string()`` on pipeline variables inside a condition and a ``Join``.

    The pipeline has a condition step comparing a string parameter with an
    integer parameter converted via ``to_string()``, and a fail step whose
    error message joins the condition outcome with both parameters. The
    pipeline is executed twice - with the default integer (3) and with the
    integer overridden to 0 - and the fail step's ``FailureReason`` is checked
    each time. The pipeline is deleted (best effort) afterwards.
    """

    def _wait_and_check_steps(execution, expected_failure_reason):
        """Wait for ``execution`` and verify its CondStep/FailStep results."""
        # A waiter error is tolerated; the listed steps are what gets checked.
        try:
            execution.wait(delay=30, max_attempts=60)
        except WaiterError:
            pass
        listed_steps = execution.list_steps()

        assert len(listed_steps) == 2
        for listed_step in listed_steps:
            if listed_step["StepName"] == "CondStep":
                assert listed_step["StepStatus"] == "Succeeded"
                continue
            assert listed_step["StepName"] == "FailStep"
            assert listed_step["StepStatus"] == "Failed"
            assert listed_step["FailureReason"] == expected_failure_reason

    param_str = ParameterString(name="MyString", default_value="1")
    param_int = ParameterInteger(name="MyInteger", default_value=3)

    greater_than = ConditionGreaterThan(left=param_str, right=param_int.to_string())
    step_cond = ConditionStep(
        name="CondStep",
        conditions=[greater_than],
        if_steps=[],
        else_steps=[],
    )
    error_message = Join(
        on=" ",
        values=[
            "condition greater than check return:",
            step_cond.properties.Outcome.to_string(),
            "and left side param str is",
            param_str,
            "and right side param int is",
            param_int,
        ],
    )

    step_fail = FailStep(
        name="FailStep",
        error_message=error_message,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[param_str, param_int],
        steps=[step_cond, step_fail],
        sagemaker_session=sagemaker_session,
    )

    try:
        pipeline_arn = pipeline.create(role)["PipelineArn"]

        # First run: default parameter values.
        first_execution = pipeline.start()
        described = first_execution.describe()
        assert described["PipelineArn"] == pipeline_arn
        _wait_and_check_steps(
            first_execution,
            "condition greater than check return: false "
            "and left side param str is 1 and right side param int is 3",
        )

        # Update int param to update cond step outcome
        second_execution = pipeline.start(parameters={"MyInteger": 0})
        _wait_and_check_steps(
            second_execution,
            "condition greater than check return: true "
            "and left side param str is 1 and right side param int is 0",
        )
    finally:
        # Best-effort cleanup: a failed delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
# Example #6
# 0
def get_pipeline(
    region,
    project_name=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",
):
    """Build the SageMaker pipeline for the abalone example.

    The pipeline consists of four steps: data preprocessing (SKLearn
    processor), training (XGBoost estimator), evaluation (script processor)
    and a condition step that registers the model when the reported MSE is
    less than or equal to 6.0.

    Network settings, the execution role, bucket names and KMS keys are not
    passed in as arguments; they are read from the mapping returned by
    ``get_environment`` for ``project_name``.

    Args:
        region: AWS region used to create the SageMaker session and to look
            up the XGBoost image URI.
        project_name: Project identifier forwarded to ``get_environment``.
        model_package_group_name: Model package group the model is
            registered in.
        pipeline_name: Name given to the pipeline.
        base_job_prefix: Prefix used for the processing/training job names
            and for the model output S3 path.

    Returns:
        The configured ``Pipeline`` instance. This function only constructs
        it; it does not create, upsert or start the pipeline.
    """

    # Dynamically load environmental SSM parameters - provide the list of the
    # variables to load from SSM parameter store
    ssm_parameters = [
        {"VariableName": "DataBucketName", "ParameterName": "data-bucket-name"},
        {"VariableName": "ModelBucketName", "ParameterName": "model-bucket-name"},
        {"VariableName": "S3KmsKeyId", "ParameterName": "kms-s3-key-arn"},
        {"VariableName": "EbsKmsKeyArn", "ParameterName": "kms-ebs-key-arn"},
    ]

    env_data = get_environment(project_name=project_name, ssm_params=ssm_parameters)
    print(f"Environment data:\n{json.dumps(env_data, indent=2)}")

    security_group_ids = env_data["SecurityGroups"]
    subnets = env_data["SubnetIds"]
    # The same execution role is used for both processing and training.
    processing_role = env_data["ExecutionRole"]
    training_role = env_data["ExecutionRole"]
    data_bucket = env_data["DataBucketName"]
    model_bucket = env_data["ModelBucketName"]
    ebs_kms_id = env_data["EbsKmsKeyArn"]
    s3_kms_id = env_data["S3KmsKeyId"]

    sagemaker_session = get_session(region, data_bucket)

    # Fall back to session-derived values when the environment provides None.
    if processing_role is None:
        processing_role = sagemaker.session.get_execution_role(sagemaker_session)
    if training_role is None:
        training_role = sagemaker.session.get_execution_role(sagemaker_session)
    if model_bucket is None:
        model_bucket = sagemaker_session.default_bucket()

    print(f"Creating the pipeline '{pipeline_name}':")
    # Adjacent f-strings are used instead of backslash continuations so that
    # source indentation does not leak into the printed text.
    print(
        f"Parameters:{region}\n{security_group_ids}\n{subnets}\n{processing_role}\n"
        f"{training_role}\n{data_bucket}\n{model_bucket}\n{model_package_group_name}\n"
        f"{pipeline_name}\n{base_job_prefix}"
    )

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://{sagemaker_session.default_bucket()}/datasets/abalone-dataset.csv",
    )

    # configure network for encryption, network isolation and VPC configuration
    # Since the preprocessor job takes the data from S3, enable_network_isolation must be set to False
    # see https://github.com/aws/amazon-sagemaker-examples/issues/1689
    network_config = NetworkConfig(
        enable_network_isolation=False,
        security_group_ids=security_group_ids,
        subnets=subnets,
        encrypt_inter_container_traffic=True,
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-abalone-preprocess",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config,
        volume_kms_key=ebs_kms_id,
        output_kms_key=s3_kms_id,
    )

    step_process = ProcessingStep(
        name="PreprocessAbaloneData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{model_bucket}/{base_job_prefix}/AbaloneTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/abalone-train",
        sagemaker_session=sagemaker_session,
        role=training_role,
        subnets=network_config.subnets,
        security_group_ids=network_config.security_group_ids,
        encrypt_inter_container_traffic=True,
        enable_network_isolation=False,
        volume_kms_key=ebs_kms_id,
        output_kms_key=s3_kms_id,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )

    # The training channels are wired to the preprocessing step's outputs.
    step_train = TrainingStep(
        name="TrainAbaloneModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-abalone-eval",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config,
        volume_kms_key=ebs_kms_id,
        output_kms_key=s3_kms_id,
    )

    # Property file exposing evaluation.json so the condition step can read it.
    evaluation_report = PropertyFile(
        name="AbaloneEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateAbaloneModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    # There is a bug in the RegisterModel implementation.
    # The RegisterModel step is implemented in the SDK as two steps, a
    # _RepackModelStep and a _RegisterModelStep.
    # The _RepackModelStep runs a SKLearn training step in order to repack the
    # model.tar.gz to include any custom inference code in the archive.
    # The _RegisterModelStep then registers the repacked model.
    #
    # The problem is that the _RepackModelStep does not propagate the VPC
    # configuration from the Estimator object:
    # https://github.com/aws/sagemaker-python-sdk/blob/cdb633b3ab02398c3b77f5ecd2c03cdf41049c78/src/sagemaker/workflow/_utils.py#L88
    #
    # This causes an AccessDenied exception because the repacker cannot access
    # the S3 bucket (all access that is not via the VPC endpoint is blocked by
    # the bucket policy).
    #
    # The issue is opened against the SageMaker Python SDK:
    # https://github.com/aws/sagemaker-python-sdk/issues/2302
    vpc_config = {
        "Subnets": network_config.subnets,
        "SecurityGroupIds": network_config.security_group_ids,
    }

    step_register = RegisterModel(
        name="RegisterAbaloneModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
        vpc_config_override=vpc_config,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSEAbaloneEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline