def test_workflow_input_placeholder():

    workflow_input = ExecutionInput()
    test_step = Pass(state_id="StateOne",
                     parameters={
                         "ParamA": "SampleValueA",
                         "ParamB": workflow_input,
                         "ParamC": workflow_input["Key01"],
                         "ParamD": workflow_input["Key02"]["Key03"],
                         "ParamE": workflow_input["Key01"]["Key03"],
                     })

    expected_repr = {
        "Type": "Pass",
        "Parameters": {
            "ParamA": "SampleValueA",
            "ParamB.$": "$$.Execution.Input",
            "ParamC.$": "$$.Execution.Input['Key01']",
            "ParamD.$": "$$.Execution.Input['Key02']['Key03']",
            "ParamE.$": "$$.Execution.Input['Key01']['Key03']"
        },
        "End": True
    }

    assert test_step.to_dict() == expected_repr
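Note on the expected output above: in Amazon States Language, a parameter key ending in ".$" takes a JsonPath value, and paths starting with "$$." are resolved against the context object rather than the state input, which is why the placeholders render as "$$.Execution.Input[...]". A small illustrative sketch (the sample input JSON below is assumed, not part of the original test) of how ParamD would resolve at run time:

# Illustrative only: Step Functions evaluates "$$.Execution.Input['Key02']['Key03']"
# against the JSON document supplied when the execution is started.
started_execution_input = {
    "Key01": {"Key03": "value-one"},
    "Key02": {"Key03": "value-two"},
}
# "ParamD.$": "$$.Execution.Input['Key02']['Key03']"  ->  ParamD == "value-two"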
Example #2
def test_step_input_order_validation():
    workflow_input = ExecutionInput()

    test_step_01 = Pass(state_id='StateOne',
                        parameters={
                            'ParamA': workflow_input['Key02']['Key03'],
                            'ParamD': workflow_input['Key01']['Key03'],
                        })

    test_step_02 = Pass(state_id='StateTwo',
                        parameters={
                            'ParamC': workflow_input["Key05"],
                            "ParamB": "SampleValueB",
                            "ParamE":
                            test_step_01.output()["Response"]["Key04"]
                        })

    test_step_03 = Pass(state_id='StateThree',
                        parameters={
                            'ParamG': "SampleValueG",
                            "ParamF": workflow_input["Key06"],
                            "ParamH": "SampleValueH"
                        })

    workflow_definition = Chain([test_step_01, test_step_03, test_step_02])

    with pytest.raises(ValueError):
        result = Graph(workflow_definition).to_dict()
Example #3
def workflow(client):
    execution_input = ExecutionInput()

    test_step_01 = Pass(state_id='StateOne',
                        parameters={
                            'ParamA': execution_input['Key02']['Key03'],
                            'ParamD': execution_input['Key01']['Key03'],
                        })

    test_step_02 = Pass(state_id='StateTwo',
                        parameters={
                            'ParamC': execution_input["Key05"],
                            "ParamB": "SampleValueB",
                            "ParamE":
                            test_step_01.output()["Response"]["Key04"]
                        })

    test_step_03 = Pass(state_id='StateThree',
                        parameters={
                            'ParamG': "SampleValueG",
                            "ParamF": execution_input["Key06"],
                            "ParamH": "SampleValueH",
                            "ParamI": test_step_02.output()
                        })

    workflow_definition = Chain([test_step_01, test_step_02, test_step_03])
    workflow = Workflow(name='TestWorkflow',
                        definition=workflow_definition,
                        role='testRoleArn',
                        execution_input=execution_input,
                        client=client)
    return workflow
Example #4
def test_workflow_input_placeholder():

    workflow_input = ExecutionInput()
    test_step = Pass(state_id='StateOne',
                     parameters={
                         'ParamA': 'SampleValueA',
                         'ParamB': workflow_input,
                         'ParamC': workflow_input['Key01'],
                         'ParamD': workflow_input['Key02']['Key03'],
                         'ParamE': workflow_input['Key01']['Key03'],
                     })

    expected_repr = {
        "Type": "Pass",
        "Parameters": {
            "ParamA": "SampleValueA",
            "ParamB.$": "$$.Execution.Input",
            "ParamC.$": "$$.Execution.Input['Key01']",
            "ParamD.$": "$$.Execution.Input['Key02']['Key03']",
            "ParamE.$": "$$.Execution.Input['Key01']['Key03']"
        },
        "End": True
    }

    assert test_step.to_dict() == expected_repr
Example #5
def add_execution_input(self, unique_name: str) -> None:
    logger.debug(f"adding execution input for {unique_name}")
    if unique_name in self.execution_input_schema:
        raise DataJobSagemakerException(
            f"The entry {unique_name} already exists in the execution input."
        )
    self.execution_input_schema[unique_name] = str
    self.execution_input = ExecutionInput(
        schema=self.execution_input_schema)
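A self-contained usage sketch of the pattern above; the registry class name and the plain ValueError are stand-ins for the original project's surrounding class and its DataJobSagemakerException (both assumptions, not from the source):

from stepfunctions.inputs import ExecutionInput

class ExecutionInputRegistry:
    """Minimal stand-in that accumulates execution-input entries by name."""

    def __init__(self):
        self.execution_input_schema = {}
        self.execution_input = ExecutionInput(schema=self.execution_input_schema)

    def add_execution_input(self, unique_name: str) -> None:
        if unique_name in self.execution_input_schema:
            raise ValueError(
                f"The entry {unique_name} already exists in the execution input.")
        self.execution_input_schema[unique_name] = str
        # Rebuild the placeholder so it reflects the extended schema.
        self.execution_input = ExecutionInput(schema=self.execution_input_schema)

registry = ExecutionInputRegistry()
registry.add_execution_input("output_path")
placeholder = registry.execution_input["output_path"]  # usable in step parameters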
Example #6
def test_model_step_with_placeholders(trained_estimator, sfn_client,
                                      sagemaker_session, sfn_role_arn):
    # Build workflow definition
    execution_input = ExecutionInput(schema={
        'ModelName': str,
        'Mode': str,
        'Tags': list
    })

    parameters = {
        'PrimaryContainer': {
            'Mode': execution_input['Mode']
        },
        'Tags': execution_input['Tags']
    }

    model_step = ModelStep('create_model_step',
                           model=trained_estimator.create_model(),
                           model_name=execution_input['ModelName'],
                           parameters=parameters)
    model_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([model_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        # Create workflow and check definition
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base(
                "integ-test-model-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn)

        inputs = {
            'ModelName': generate_job_name(),
            'Mode': 'SingleModel',
            'Tags': [{
                'Key': 'Environment',
                'Value': 'test'
            }]
        }

        # Execute workflow
        execution = workflow.execute(inputs=inputs)
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get("ModelArn") is not None
        assert execution_output["SdkHttpMetadata"]["HttpStatusCode"] == 200

        # Cleanup
        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
        model_name = get_resource_name_from_arn(
            execution_output.get("ModelArn")).split("/")[1]
        delete_sagemaker_model(model_name, sagemaker_session)
Example #7
def test_placeholder_make_immutable():
    workflow_input = ExecutionInput()
    workflow_input["A"]["b"].get("C", float)
    workflow_input["Message"]
    workflow_input["Key01"]["Key02"]
    workflow_input["Key03"]
    workflow_input["Key03"]["Key04"]

    assert check_immutable(workflow_input) == False

    workflow_input._make_immutable()
    assert check_immutable(workflow_input) == True
Example #8
def test_map_state_with_placeholders():
    workflow_input = ExecutionInput()
    step_result = StepResult()

    map_state = Map(state_id="MapState01",
                    result_selector={
                        "foo": step_result["foo"],
                        "bar": step_result["bar1"]["bar2"]
                    })
    iterator_state = Pass("TrainIterator",
                          parameters={
                              "ParamA": map_state.output()["X"]["Y"],
                              "ParamB":
                              workflow_input["Key01"]["Key02"]["Key03"]
                          })

    map_state.attach_iterator(iterator_state)
    workflow_definition = Chain([map_state])

    expected_repr = {
        "StartAt": "MapState01",
        "States": {
            "MapState01": {
                "Type": "Map",
                "ResultSelector": {
                    "foo.$": "$['foo']",
                    "bar.$": "$['bar1']['bar2']"
                },
                "End": True,
                "Iterator": {
                    "StartAt": "TrainIterator",
                    "States": {
                        "TrainIterator": {
                            "Parameters": {
                                "ParamA.$":
                                "$['X']['Y']",
                                "ParamB.$":
                                "$$.Execution.Input['Key01']['Key02']['Key03']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }
            }
        }
    }

    result = Graph(workflow_definition).to_dict()
    assert result == expected_repr
Example #9
def test_map_state_with_placeholders():
    workflow_input = ExecutionInput()

    map_state = Map('MapState01')
    iterator_state = Pass('TrainIterator',
                          parameters={
                              'ParamA': map_state.output()['X']["Y"],
                              'ParamB':
                              workflow_input["Key01"]["Key02"]["Key03"]
                          })

    map_state.attach_iterator(iterator_state)
    workflow_definition = Chain([map_state])

    expected_repr = {
        "StartAt": "MapState01",
        "States": {
            "MapState01": {
                "Type": "Map",
                "End": True,
                "Iterator": {
                    "StartAt": "TrainIterator",
                    "States": {
                        "TrainIterator": {
                            "Parameters": {
                                "ParamA.$":
                                "$['X']['Y']",
                                "ParamB.$":
                                "$$.Execution.Input['Key01']['Key02']['Key03']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }
            }
        }
    }

    result = Graph(workflow_definition).to_dict()
    assert result == expected_repr
Example #10
def test_placeholder_with_schema():
    test_schema = {
        "A": {
            "B":{
                "C": int
            }
        },
        "Request": {
            "Status": str
        },
        "Hello": float
    }
    workflow_input = ExecutionInput(schema=test_schema)
    assert workflow_input.get_schema_as_dict() == test_schema
    assert workflow_input.immutable == True

    with pytest.raises(ValueError):
        workflow_input["A"]["B"]["D"]
    
    with pytest.raises(ValueError):
        workflow_input["A"]["B"].get("C", float)
Example #11
def test_placeholder_schema_as_dict():
    workflow_input = ExecutionInput()
    workflow_input["A"]["b"].get("C", float)
    workflow_input["Message"]
    workflow_input["Key01"]["Key02"]
    workflow_input["Key03"]
    workflow_input["Key03"]["Key04"]

    expected_schema = {
        "A": {
            "b": {
                "C": float
            }
        },
        "Message": str,
        "Key01": {
            "Key02": str
        },
        "Key03": {
            "Key04": str
        }
    }

    assert workflow_input.get_schema_as_dict() == expected_schema
Example #12
                                    output_path='s3://{}/{}/output'.format(bucket, project_name))

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        eval_metric='error',
                        num_round=100)


# Build out the workflow
execution_input = ExecutionInput(schema={
    'TrainingJobName': str,
    'ModelName': str
})

etl_step = steps.GlueStartJobRunStep(
    'Extract, Transform, Load',
    parameters={"JobName": job_name,
                "Arguments":{
                    '--S3_SOURCE': data_source,
                    '--S3_DEST': 's3a://{}/{}/'.format(bucket, project_name),
                    '--TRAIN_KEY': train_prefix + '/',
                    '--VAL_KEY': val_prefix +'/'}
               }
)


training_step = steps.TrainingStep(
Example #13
    parser.add_argument('--batch_job_definition', type=str, default=os.environ['BATCH_JOB_DEFINITION'])
    parser.add_argument('--batch_job_name', type=str, default=os.environ['BATCH_JOB_NAME'])
    parser.add_argument('--batch_job_queue', type=str, default=os.environ['BATCH_JOB_QUEUE'])
    parser.add_argument('--train_url', type=str, default=os.environ['TRAIN_URL'])
    parser.add_argument('--data_path', type=str, default=os.environ['DATA_PATH'])
    parser.add_argument('--batch_size', type=str, default=os.environ['BATCH_SIZE'])
    parser.add_argument('--epoch', type=str, default=os.environ['EPOCH'])
    args = parser.parse_args()


    # Define the schema for the information passed to the Step Functions execution
    execution_input = ExecutionInput(schema={
        # AWS Batch
        'BatchJobDefinition': str,
        'BatchJobName': str,
        'BatchJobQueue': str,

        # SageMaker
        'TrainJobName': str,
        }
    )

    # Describe the Step Functions workflow definition
    inputs={
        # AWS Batch
        'BatchJobDefinition': args.batch_job_definition,
        'BatchJobName': args.batch_job_name,
        'BatchJobQueue': args.batch_job_queue,

        # SageMaker Training
        'TrainJobName': TRAINING_JOB_NAME
        }
Example #14
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow

stepfunctions.set_stream_logger(level=logging.INFO)
id = uuid.uuid4().hex

FLOW_NAME = 'active_learning_flow_{}'.format(id)
WORKFLOW_ROLE = 'ROLE ARN'

if __name__ == '__main__':
    # Define the schema for the information passed to the Step Functions execution
    execution_input = ExecutionInput(
        schema={
            # AWS Batch
            'BatchJobDefinition': str,
            'BatchJobName': str,
            'BatchJobQueue': str,

            # AWS Lambda
            'LambdaFunctionName': str,
        })

    # Describe the Step Functions workflow definition
    inputs = {
        # AWS Batch
        'BatchJobDefinition': 'active-learning-job_run:1',
        'BatchJobName': 'active-learning-inference',
        'BatchJobQueue': 'active-learning-inference',

        # AWS Lambda
        'LambdaFunctionName': 'create_labeling_job'
    }
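The snippet stops before the schema and the inputs dict are wired together. A hedged sketch of the usual next steps with this library, reusing the names defined above; workflow_definition is a placeholder for the step chain the original omits:

    # Sketch only: `workflow_definition` stands in for the Chain of steps this
    # snippet does not show (for example a Batch submit step followed by a Lambda step).
    workflow = Workflow(name=FLOW_NAME,
                        definition=workflow_definition,
                        role=WORKFLOW_ROLE,
                        execution_input=execution_input)
    workflow.create()
    execution = workflow.execute(inputs=inputs)  # keys must match the ExecutionInput schema
    execution_output = execution.get_output(wait=True)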
Example #15
def setup_workflow(project, purpose, workflow_execution_role, script_dir,
                   ecr_repository):
    """ to setup all needed for a step function with sagemaker.
    arg: 
        project: project name under sagemaker
        purpose: subproject
        workflow_execution_role: arn to execute step functions
        script_dir: processing file name, like a .py file
        ecr_repository: ecr repository name
    return:
        workflow: a stepfunctions.workflow.Workflow instance  
    example: 
        PROJECT = '[dpt-proj-2022]'
        PURPOSE = '[processing]'
        WORKFLOW_EXECUTION_ROLE = "arn:aws-cn:iam::[*********]:role/[**************]"
        SCRIPT_DIR = "[processing].py"
        ECR_REPOSITORY = '[ecr-2022]'
    """

    # SageMaker Session setup
    # ========================================================================================
    # SageMaker Session
    # ====================================
    account_id = boto3.client('sts').get_caller_identity().get('Account')
    role = sagemaker.get_execution_role()

    # Storage
    # ====================================
    session = sagemaker.Session()
    region = session.boto_region_name
    s3_output = session.default_bucket()

    # Code storage
    # ==================
    s3_prefix = '{}/{}'.format(project, purpose)
    s3_prefix_code = '{}/code'.format(s3_prefix)
    s3CodePath = 's3://{}/{}/code'.format(s3_output, s3_prefix)

    ## preprocess & prediction
    script_list = [script_dir]

    for script in script_list:
        session.upload_data(script,
                            bucket=session.default_bucket(),
                            key_prefix=s3_prefix_code)

    # ECR environment
    # ====================================
    uri_suffix = 'amazonaws.com.cn'
    tag = ':latest'
    ecr_repository_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region,
                                                      uri_suffix,
                                                      ecr_repository + tag)

    # SageMaker Experiments setup
    # ========================================================================================
    experiment = Experiment.create(
        experiment_name="{}-{}".format(project, int(time.time())),
        description="machine learning project",
        sagemaker_boto_client=boto3.client('sagemaker'))
    print(experiment)

    execution_input = ExecutionInput(schema={
        "ProcessingJobName": str,
        "ResultPath": str,
    })

    # setup script processor
    script_processor = ScriptProcessor(command=['python3'],
                                       image_uri=ecr_repository_uri,
                                       role=role,
                                       instance_count=1,
                                       instance_type='ml.m5.4xlarge')

    # Step
    # ========================================================================================

    optimizing_step = steps.ProcessingStep(
        "Processing Step",
        processor=script_processor,
        job_name=execution_input["ProcessingJobName"],
        inputs=[
            ProcessingInput(source=s3CodePath,
                            destination='/opt/ml/processing/input/code',
                            input_name='code')
        ],
        outputs=[
            ProcessingOutput(output_name=purpose,
                             destination=execution_input["ResultPath"],
                             source='/opt/ml/processing/{}'.format(purpose))
        ],
        container_entrypoint=[
            "python3", "/opt/ml/processing/input/code/" + script_dir
        ],
    )

    # Fail State
    # ========================================================================================
    failed_state = steps.states.Fail("Processing Workflow failed",
                                     cause="SageMakerProcessingJobFailed")

    catch_state_processing = steps.states.Catch(
        error_equals=["States.TaskFailed"], next_step=failed_state)

    # Create Workflow
    # ========================================================================================
    optimizing_step.add_catch(catch_state_processing)

    workflow_name = "workflow-{}-{}".format(project, purpose).upper()
    workflow_graph = steps.Chain([optimizing_step])

    workflow = Workflow(name=workflow_name,
                        definition=workflow_graph,
                        role=workflow_execution_role)

    workflow.create()
    return workflow
Example #16
from stepfunctions.template.utils import replace_parameters_with_jsonpath

stepfunctions.set_stream_logger(level=logging.INFO)

region = boto3.Session().region_name

model_namea = f"DEMO-decission-tree-pred-{datetime.now():%Y-%m-%d-%H-%M-%S}"
model_nameb = f"DEMO-random-forest-pred-{datetime.now():%Y-%m-%d-%H-%M-%S}"

# Create a schema for input
event_input = ExecutionInput(schema={
    'BuildId': str,
    'ModelA': str,
    'ModelB': str,
    'Endpoint': str,
    'ecrArnA': str,
    'ecrArnB': str,
    'dataBucketPath': str,
    'authorDate': str,
    'triggerSource': str,
    'commitId': str,
})

# Define static variables determined by appsec
sagemaker_role = 'arn:aws:iam::860660749434:role/qls-28583-e80f1ff13e6e273a-SageMakerRole-ND1XCTEJG4JM'
workflow_role = 'arn:aws:iam::860660749434:role/qls-28583-e80f1ff13e6e273a-StepFunctionsRole-1873OQ5BK2E8U'
ecr_ArnA = 'latesta'
ecr_ArnB = 'latestb'
state_machine_arn = 'arn:aws:states:us-west-2:860660749434:stateMachine:trainingStateMachine-qxyULJR6C733'
state_machine_name = 'trainingStateMachine-qxyULJR6C733'
dynamoDBTable = 'qls-28583-e80f1ff13e6e273a-DynamoDBTable-13I0WGPSJZZVI'
endpoint_wait_lambda = 'arn:aws:lambda:us-west-2:860660749434:function:qls-28583-e80f1ff13e6e273a-endpointWaitLambda-dpyAW80Wkrh3'
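The static state machine ARN above suggests this script attaches to an existing state machine rather than creating a new one. A hedged sketch of that pattern with this library (assuming stepfunctions.workflow.Workflow is imported); the execution input values below are illustrative, not taken from the source:

# Sketch only: attach to the pre-existing state machine and start an execution
# whose input matches the `event_input` schema defined above.
workflow = Workflow.attach(state_machine_arn)
execution = workflow.execute(inputs={
    'BuildId': 'build-0001',
    'ModelA': model_namea,
    'ModelB': model_nameb,
    'Endpoint': 'demo-endpoint',
    'ecrArnA': ecr_ArnA,
    'ecrArnB': ecr_ArnB,
    'dataBucketPath': 's3://example-bucket/data',
    'authorDate': '2021-01-01',
    'triggerSource': 'codepipeline',
    'commitId': 'abc1234',
})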
Example #17
def define_training_pipeline(
    sm_role,
    workflow_execution_role,
    training_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_training_pipeline.yaml",
    kms_key_id=None,
):
    """
    Return YAML definition of the training pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                                it returns an instance of
                                    `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:             If not None, a YAML file will be generated at
                                    this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "PreprocessingCodeURL": str,
            "TrainingJobName": str,
            # Prevent sagemaker config hardcode sagemaker_submit_directory in
            # workflow definition
            "SMSubmitDirURL": str,
            # Prevent sagemaker config hardcode sagemaker_region in workflow definition
            "SMRegion": str,
            "EvaluationProcessingJobName": str,
            "EvaluationCodeURL": str,
            "EvaluationResultURL": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "PreprocessedModelURL": str,
            "SMOutputDataURL": str,
            "SMDebugOutputURL": str,
        })
    """
    Data pre-processing and feature engineering
    """
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )

    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/train",
            destination=execution_input["PreprocessedTrainDataURL"],
            output_name="train_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/model",
            destination=execution_input["PreprocessedModelURL"],
            output_name="proc_model",
        ),
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=[
            "--train-test-split-ratio", "0.2", "--mode", "train"
        ],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
        kms_key_id=kms_key_id,
    )
    """
    Training using the pre-processed data
    """
    sklearn = SKLearn(
        entry_point="../../src/mlmax/train.py",
        train_instance_type="ml.m5.xlarge",
        role=sm_role,
        py_version="py3",
        framework_version="0.20.0",
        output_kms_key=kms_key_id,
    )

    training_step = MLMaxTrainingStep(
        "SageMaker Training Step",
        estimator=sklearn,
        job_name=execution_input["TrainingJobName"],
        train_data=execution_input["PreprocessedTrainDataURL"],
        test_data=execution_input["PreprocessedTestDataURL"],
        sm_submit_url=execution_input["SMSubmitDirURL"],
        sm_region=execution_input["SMRegion"],
        sm_output_data=execution_input["SMOutputDataURL"],
        sm_debug_output_data=execution_input["SMDebugOutputURL"],
        wait_for_completion=True,
    )
    """
    Model evaluation
    """
    # Create input and output objects for Model Evaluation ProcessingStep.
    inputs_evaluation = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/test",
            input_name="input-1",
        ),
        ProcessingInput(
            source=training_step.get_expected_model().model_data,
            destination="/opt/ml/processing/model",
            input_name="input-2",
        ),
        ProcessingInput(
            source=execution_input["EvaluationCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs_evaluation = [
        ProcessingOutput(
            source="/opt/ml/processing/evaluation",
            destination=execution_input["EvaluationResultURL"],
            output_name="evaluation",
        ),
    ]

    model_evaluation_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )

    processing_evaluation_step = ProcessingStep(
        "SageMaker Processing Model Evaluation step",
        processor=model_evaluation_processor,
        job_name=execution_input["EvaluationProcessingJobName"],
        inputs=inputs_evaluation,
        outputs=outputs_evaluation,
        container_entrypoint=[
            "python3", "/opt/ml/processing/input/code/evaluation.py"
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)
    processing_evaluation_step.add_catch(catch_state_processing)
    training_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain(
        [processing_step, training_step, processing_evaluation_step])
    training_pipeline = Workflow(
        name=training_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return training_pipeline
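A hedged usage sketch for the function above: create the workflow once, then start executions that supply a concrete value for every key in the ExecutionInput schema. All role ARNs and S3 URLs below are placeholders, not values from the source:

# Sketch only: every ARN and S3 URL here is a placeholder.
training_pipeline = define_training_pipeline(
    sm_role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",
    workflow_execution_role="arn:aws:iam::111122223333:role/StepFunctionsExecutionRole",
    training_pipeline_name="mlmax-training-pipeline",
)
training_pipeline.create()
execution = training_pipeline.execute(inputs={
    "InputDataURL": "s3://example-bucket/input/",
    "PreprocessingJobName": "preprocess-job-001",
    "PreprocessingCodeURL": "s3://example-bucket/code/preprocessing.py",
    "TrainingJobName": "training-job-001",
    "SMSubmitDirURL": "s3://example-bucket/code/sourcedir.tar.gz",
    "SMRegion": "us-east-1",
    "EvaluationProcessingJobName": "evaluation-job-001",
    "EvaluationCodeURL": "s3://example-bucket/code/evaluation.py",
    "EvaluationResultURL": "s3://example-bucket/evaluation/",
    "PreprocessedTrainDataURL": "s3://example-bucket/preprocessed/train/",
    "PreprocessedTestDataURL": "s3://example-bucket/preprocessed/test/",
    "PreprocessedModelURL": "s3://example-bucket/preprocessed/model/",
    "SMOutputDataURL": "s3://example-bucket/sm-output/",
    "SMDebugOutputURL": "s3://example-bucket/sm-debug/",
})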
Example #18
def test_placeholder_path():
    workflow_input = ExecutionInput()
    placeholder_variable = workflow_input["A"]["b"]["C"]
    expected_path = ["A", "b", "C"]
    assert placeholder_variable._get_path() == expected_path
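For reference, _get_path() returns the raw key path; the Pass-state tests earlier in this collection show the same path rendered as a JsonPath string once the placeholder is used in a state's parameters. A small sketch consistent with those tests (state and parameter names are assumed):

# Consistent with the Pass-state tests above: the path ["A", "b", "C"] renders as
# "$$.Execution.Input['A']['b']['C']" when the placeholder appears in parameters.
step = Pass(state_id="ShowPath",
            parameters={"Param": workflow_input["A"]["b"]["C"]})
assert step.to_dict()["Parameters"] == {"Param.$": "$$.Execution.Input['A']['b']['C']"}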
Example #19
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow
from time import gmtime, strftime
from sagemaker.model_monitor import DataCaptureConfig
import utils

sagemaker_session = sagemaker.Session()
sagemaker_exec_role = utils.get_sagemaker_execution_role()
sfn_client = client('stepfunctions')
# define execution input
execution_input = ExecutionInput(
    schema={
        'AutoMLJobName': str,
        'ModelName': str,
        'S3InputData': str,
        'IamRole': str,
        'TargetColumnName': str,
        'S3OutputData': str,
        'Tags': dict,
        'EndpointName': str,
        'EndpointConfigName': str
    })

# TODO: make this a notification
workflow_failure = Fail('WorkflowFailed')

# create autopilot lambda step
create_autopilot_job_step = LambdaStep(
    'StartAutopilotJob',
    parameters={
        'FunctionName': 'CreateAutopilotJob',
        'Payload': {
Example #20
def define_data_pipeline(
    sm_role,
    workflow_execution_role,
    data_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_data_pipeline.yaml",
):
    """
    Return YAML definition of the data pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                                it returns an instance of
                                    `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:             If not None, a YAML file will be generated at
                                    this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "PreprocessingJobName": str,
            "PreprocessingCodeURL": str,
            "PreprocessedOutputDataURL": str,
            "S3InputPath": str,
            "S3OutputPath": str,
        }
    )

    """
    Data pre-processing and feature engineering
    """
    # processor = PySparkProcessor(
    region = "ap-southeast-1"
    image = "sagemaker-spark-processing"
    img_uri = f"759080221371.dkr.ecr.{region}.amazonaws.com/{image}:2.4-cpu"
    processor = ScriptProcessor(
        image_uri=img_uri,
        role=sm_role,
        instance_count=16,
        instance_type="ml.m5.2xlarge",
        command=["/opt/program/submit"],
        max_runtime_in_seconds=3600,
        env={"mode": "python"},
    )

    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=execution_input["PreprocessedOutputDataURL"],
            output_name="processed_data",
        ),
    ]

    processing_step = MLMAXProcessingStep(
        "SageMaker pre-processing step",
        processor=processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        environment={
            "S3InputPath": execution_input["S3InputPath"],
            "S3OutputPath": execution_input["S3OutputPath"],
        },
        container_entrypoint=[
            "smspark-submit",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed"
    )

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain([processing_step])
    data_pipeline = Workflow(
        name=data_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return data_pipeline
Example #21
def test_parallel_state_with_placeholders():
    workflow_input = ExecutionInput()
    step_result = StepResult()

    parallel_state = Parallel(state_id="ParallelState01",
                              result_selector={
                                  "foo": step_result["foo"],
                                  "bar": step_result["bar1"]["bar2"]
                              })

    branch_A = Pass("Branch_A",
                    parameters={
                        "ParamA": parallel_state.output()["A"]["B"],
                        "ParamB": workflow_input["Key01"]
                    })

    branch_B = Pass("Branch_B",
                    parameters={
                        "ParamA":
                        "TestValue",
                        "ParamB":
                        parallel_state.output()["Response"]["Key"]["State"]
                    })

    branch_C = Pass("Branch_C",
                    parameters={
                        "ParamA":
                        parallel_state.output()["A"]["B"].get("C", float),
                        "ParamB": "HelloWorld"
                    })

    parallel_state.add_branch(branch_A)
    parallel_state.add_branch(branch_B)
    parallel_state.add_branch(branch_C)

    workflow_definition = Chain([parallel_state])
    result = Graph(workflow_definition).to_dict()

    expected_repr = {
        "StartAt": "ParallelState01",
        "States": {
            "ParallelState01": {
                "Type":
                "Parallel",
                "ResultSelector": {
                    "foo.$": "$['foo']",
                    "bar.$": "$['bar1']['bar2']"
                },
                "End":
                True,
                "Branches": [{
                    "StartAt": "Branch_A",
                    "States": {
                        "Branch_A": {
                            "Parameters": {
                                "ParamA.$": "$['A']['B']",
                                "ParamB.$": "$$.Execution.Input['Key01']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }, {
                    "StartAt": "Branch_B",
                    "States": {
                        "Branch_B": {
                            "Parameters": {
                                "ParamA": "TestValue",
                                "ParamB.$": "$['Response']['Key']['State']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }, {
                    "StartAt": "Branch_C",
                    "States": {
                        "Branch_C": {
                            "Parameters": {
                                "ParamA.$": "$['A']['B']['C']",
                                "ParamB": "HelloWorld"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }]
            }
        }
    }

    assert result == expected_repr
Example #22
def test_training_step_creation_with_placeholders(pca_estimator):
    execution_input = ExecutionInput(schema={
        'Data': str,
        'OutputPath': str,
    })

    step_input = StepInput(schema={
        'JobName': str,
    })

    step = TrainingStep(
        'Training',
        estimator=pca_estimator,
        job_name=step_input['JobName'],
        data=execution_input['Data'],
        output_data_config_path=execution_input['OutputPath'],
        experiment_config={
            'ExperimentName': 'pca_experiment',
            'TrialName': 'pca_trial',
            'TrialComponentDisplayName': 'Training'
        },
        tags=DEFAULT_TAGS,
    )
    assert step.to_dict() == {
        'Type': 'Task',
        'Parameters': {
            'AlgorithmSpecification': {
                'TrainingImage': PCA_IMAGE,
                'TrainingInputMode': 'File'
            },
            'OutputDataConfig': {
                'S3OutputPath.$': "$$.Execution.Input['OutputPath']"
            },
            'StoppingCondition': {
                'MaxRuntimeInSeconds': 86400
            },
            'ResourceConfig': {
                'InstanceCount': 1,
                'InstanceType': 'ml.c4.xlarge',
                'VolumeSizeInGB': 30
            },
            'RoleArn':
            EXECUTION_ROLE,
            'HyperParameters': {
                'feature_dim': '50000',
                'num_components': '10',
                'subtract_mean': 'True',
                'algorithm_mode': 'randomized',
                'mini_batch_size': '200'
            },
            'InputDataConfig': [{
                'ChannelName': 'training',
                'DataSource': {
                    'S3DataSource': {
                        'S3DataDistributionType': 'FullyReplicated',
                        'S3DataType': 'S3Prefix',
                        'S3Uri.$': "$$.Execution.Input['Data']"
                    }
                }
            }],
            'ExperimentConfig': {
                'ExperimentName': 'pca_experiment',
                'TrialName': 'pca_trial',
                'TrialComponentDisplayName': 'Training'
            },
            'TrainingJobName.$':
            "$['JobName']",
            'Tags':
            DEFAULT_TAGS_LIST
        },
        'Resource': 'arn:aws:states:::sagemaker:createTrainingJob.sync',
        'End': True
    }
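Note the two placeholder families render differently in the assertion above: ExecutionInput keys become "$$.Execution.Input[...]" context paths, while StepInput keys become "$[...]" paths relative to the state's own input. A minimal sketch isolating just that difference (state and parameter names are assumed):

# Sketch only: compares how the two placeholder types serialize.
execution_input = ExecutionInput(schema={'OutputPath': str})
step_input = StepInput(schema={'JobName': str})
compare_step = Pass(state_id='CompareInputs',
                    parameters={
                        'FromExecution': execution_input['OutputPath'],
                        'FromStep': step_input['JobName'],
                    })
assert compare_step.to_dict()['Parameters'] == {
    'FromExecution.$': "$$.Execution.Input['OutputPath']",
    'FromStep.$': "$['JobName']",
}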
Example #23
def define_inference_pipeline(
    sm_role,
    workflow_execution_role,
    inference_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_inference_pipeline.yaml",
    kms_key_id=None,
):
    """
    Return YAML definition of the inference pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                     it returns an instance of `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:  If not None, a YAML file will be generated at this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "InferenceJobName": str,
            "ProcModelS3": str,
            "PreprocessingCodeURL": str,
            "InferenceCodeURL": str,
            "ModelS3": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "OutputPathURL": str,
        })
    """
    Create Preprocessing Model from model artifact.
    """
    # sagemaker_session = sagemaker.Session()

    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ProcModelS3"],
            destination="/opt/ml/processing/model",
            input_name="proc_model",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=["--mode", "infer"],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
        kms_key_id=kms_key_id,
    )
    """
    Create inference with sklearn processing step.

    Inputs are the preprocessed data S3 URL, the inference code S3 URL, and
    the model S3 URL. Output is the inferred data.
    """
    sklearn_processor2 = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    inputs = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["InferenceCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ModelS3"],
            destination="/opt/ml/processing/model",
            input_name="model",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["OutputPathURL"],
            output_name="test_data",
        ),
    ]

    inference_step = ProcessingStep(
        "SageMaker inference step",
        processor=sklearn_processor2,
        job_name=execution_input["InferenceJobName"],
        inputs=inputs,
        outputs=outputs,
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/inference.py",
        ],
        kms_key_id=kms_key_id,
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )

    processing_step.add_catch(catch_state_processing)
    inference_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain([processing_step, inference_step])
    inference_pipeline = Workflow(
        name=inference_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return inference_pipeline
Example #24
def test_workflow_with_placeholders():
    workflow_input = ExecutionInput()

    test_step_01 = Pass(state_id='StateOne',
                        parameters={
                            'ParamA': workflow_input['Key02']['Key03'],
                            'ParamD': workflow_input['Key01']['Key03'],
                        })

    test_step_02 = Pass(state_id='StateTwo',
                        parameters={
                            'ParamC': workflow_input["Key05"],
                            "ParamB": "SampleValueB",
                            "ParamE":
                            test_step_01.output()["Response"]["Key04"]
                        })

    test_step_03 = Pass(state_id='StateThree',
                        parameters={
                            'ParamG': "SampleValueG",
                            "ParamF": workflow_input["Key06"],
                            "ParamH": "SampleValueH"
                        })

    workflow_definition = Chain([test_step_01, test_step_02, test_step_03])

    result = Graph(workflow_definition).to_dict()

    expected_workflow_repr = {
        "StartAt": "StateOne",
        "States": {
            "StateOne": {
                "Type": "Pass",
                "Parameters": {
                    "ParamA.$": "$$.Execution.Input['Key02']['Key03']",
                    "ParamD.$": "$$.Execution.Input['Key01']['Key03']"
                },
                "Next": "StateTwo"
            },
            "StateTwo": {
                "Type": "Pass",
                "Parameters": {
                    "ParamC.$": "$$.Execution.Input['Key05']",
                    "ParamB": "SampleValueB",
                    "ParamE.$": "$['Response']['Key04']"
                },
                "Next": "StateThree"
            },
            "StateThree": {
                "Type": "Pass",
                "Parameters": {
                    "ParamG": "SampleValueG",
                    "ParamF.$": "$$.Execution.Input['Key06']",
                    "ParamH": "SampleValueH"
                },
                "End": True
            }
        }
    }

    assert result == expected_workflow_repr
Example #25
def main(
    git_branch,
    codebuild_id,
    pipeline_name,
    model_name,
    deploy_role,
    sagemaker_role,
    sagemaker_bucket,
    data_dir,
    output_dir,
    ecr_dir,
    kms_key_id,
    workflow_role_arn,
    notification_arn,
    sagemaker_project_id,
    tags,
):
    # Define the function names
    create_experiment_function_name = "mlops-create-experiment"
    query_training_function_name = "mlops-query-training"

    # Get the region
    region = boto3.Session().region_name
    print("region: {}".format(region))

    if ecr_dir:
        # Load the image uri and input data config
        with open(os.path.join(ecr_dir, "imageDetail.json"), "r") as f:
            image_uri = json.load(f)["ImageURI"]
    else:
        # Get the managed image URI for the current region
        image_uri = get_training_image(region)
    print("image uri: {}".format(image_uri))

    with open(os.path.join(data_dir, "inputData.json"), "r") as f:
        input_data = json.load(f)
        print("training uri: {}".format(input_data["TrainingUri"]))
        print("validation uri: {}".format(input_data["ValidationUri"]))
        print("baseline uri: {}".format(input_data["BaselineUri"]))

    # Get the job id and source revisions
    job_id = get_pipeline_execution_id(pipeline_name, codebuild_id)
    revisions = get_pipeline_revisions(pipeline_name, job_id)
    git_commit_id = revisions["ModelSourceOutput"]
    data_version_id = revisions["DataSourceOutput"]
    print("job id: {}".format(job_id))
    print("git commit: {}".format(git_commit_id))
    print("data version: {}".format(data_version_id))

    # Set the output Data
    output_data = {
        "ModelOutputUri":
        "s3://{}/{}".format(sagemaker_bucket, model_name),
        "BaselineOutputUri":
        f"s3://{sagemaker_bucket}/{model_name}/monitoring/baseline/{model_name}-pbl-{job_id}",
    }
    print("model output uri: {}".format(output_data["ModelOutputUri"]))

    # Pass these into the training method
    hyperparameters = {}
    if os.path.exists(os.path.join(data_dir, "hyperparameters.json")):
        with open(os.path.join(data_dir, "hyperparameters.json"), "r") as f:
            hyperparameters = json.load(f)
            for i in hyperparameters:
                hyperparameters[i] = str(hyperparameters[i])

    # Define the step functions execution input schema
    execution_input = ExecutionInput(
        schema={
            "GitBranch": str,
            "GitCommitHash": str,
            "DataVersionId": str,
            "ExperimentName": str,
            "TrialName": str,
            "BaselineJobName": str,
            "BaselineOutputUri": str,
            "TrainingJobName": str,
        })

    # Create experiment step
    experiment_step = create_experiment_step(create_experiment_function_name)
    baseline_step = create_baseline_step(input_data, execution_input, region,
                                         sagemaker_role)
    training_step = create_training_step(
        image_uri,
        hyperparameters,
        input_data,
        output_data,
        execution_input,
        query_training_function_name,
        region,
        sagemaker_role,
    )
    workflow_definition = create_graph(experiment_step, baseline_step,
                                       training_step)

    # Create the workflow as the model name
    workflow = Workflow(model_name, workflow_definition, workflow_role_arn)
    print("Creating workflow: {0}-{1}".format(model_name,
                                              sagemaker_project_id))

    # Create output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Write the workflow graph to json
    with open(os.path.join(output_dir, "workflow-graph.json"), "w") as f:
        f.write(workflow.definition.to_json(pretty=True))

    # Write the workflow graph to yml
    with open(os.path.join(output_dir, "workflow-graph.yml"), "w") as f:
        f.write(workflow.get_cloudformation_template())

    # Write the workflow inputs to file
    with open(os.path.join(output_dir, "workflow-input.json"), "w") as f:
        workflow_inputs = {
            "ExperimentName": "{}".format(model_name),
            "TrialName": "{}-{}".format(model_name, job_id),
            "GitBranch": git_branch,
            "GitCommitHash": git_commit_id,
            "DataVersionId": data_verison_id,
            "BaselineJobName": "{}-pbl-{}".format(model_name, job_id),
            "BaselineOutputUri": output_data["BaselineOutputUri"],
            "TrainingJobName": "{}-{}".format(model_name, job_id),
        }
        json.dump(workflow_inputs, f)

    # Write the dev & prod params for CFN
    with open(os.path.join(output_dir, "deploy-model-dev.json"), "w") as f:
        config = get_dev_config(model_name, job_id, deploy_role, image_uri,
                                kms_key_id, sagemaker_project_id)
        json.dump(config, f)
    with open(os.path.join(output_dir, "deploy-model-prd.json"), "w") as f:
        config = get_prd_config(
            model_name,
            job_id,
            deploy_role,
            image_uri,
            kms_key_id,
            notification_arn,
            sagemaker_project_id,
        )
        json.dump(config, f)
Example #26
def define_monitor_pipeline(
    account,
    region,
    sm_role,
    workflow_execution_role,
    data_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_data_pipeline.yaml",
):
    """
    Return YAML definition of the monitoring pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                                it returns an instance of
                                    `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:             If not None, a YAML file will be generated at
                                    this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "PreprocessingJobName": str,
            "PreprocessingInferJobName": str,
            "PreprocessingCodeURL": str,
            "MonitorTrainOutputURL": str,
            "MonitorInferOutputURL": str,
            "InputDataURL": str,
            "InferDataURL": str,
        })
    """
    Custom container for monitoring
    """
    image = "mlmax-processing-monitor"
    img_uri = f"{account}.dkr.ecr.{region}.amazonaws.com/{image}:latest"
    processor = ScriptProcessor(
        image_uri=img_uri,
        role=sm_role,
        instance_count=16,
        instance_type="ml.m5.2xlarge",
        command=["/opt/program/submit"],
        max_runtime_in_seconds=3600,
        env={"mode": "python"},
    )

    #############################
    # Baseline
    #############################
    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/train_input",
            input_name="train-input-data",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/profiling/inference",
            destination=execution_input["MonitorTrainOutputURL"],
            output_name="baseline-data",
        )
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing Baseline",
        processor=processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=[
            "--train-test-split-ratio", "0.2", "--mode", "train"
        ],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/monitoring.py",
        ],
    )

    #############################
    # Inference
    #############################
    inputs = [
        ProcessingInput(
            source=execution_input["InferDataURL"],
            destination="/opt/ml/processing/infer_input",
            input_name="infer-input-data",
        ),
        ProcessingInput(
            source=execution_input["MonitorTrainOutputURL"],
            destination="/opt/ml/processing/profiling",
            input_name="baseline-data",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/profiling/inference",
            destination=execution_input["MonitorInferOutputURL"],
            output_name="monitor-output",
        )
    ]

    processing_step_inference = ProcessingStep(
        "SageMaker pre-processing Inference",
        processor=processor,
        job_name=execution_input["PreprocessingInferJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=["--mode", "infer"],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/monitoring.py",
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)
    processing_step_inference.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain([processing_step, processing_step_inference])
    data_pipeline = Workflow(
        name=data_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return data_pipeline
Example #27
    job_name_prefix = params['job-name-prefix'] 
    # job_name = job_name_prefix + '-' + timestamp

    sagemaker_role = params['sagemaker-role-arn']
    # prepro_job_name = 'prepro-' + job_name
    # train_job_name = 'train-' + job_name
    # eval_job_name = 'eval-' + job_name
    prepro_job_name = params['prep-job-name']
    train_job_name = params['train-job-name']
    eval_job_name = params['eval-job-name']

    execution_input = ExecutionInput(
        schema={
            "PreprocessingJobName": str,
            "TrainingJobName": str,
            "EvaluationJobName": str,
        }
    )

    pre_processor = create_prepro_processing(params,
                                             prepro_job_name, sagemaker_role)
    processing_step = create_prepro_step(params,
                                         pre_processor, execution_input)

    estimator = create_estimator(params, sagemaker_role)
    training_step = create_training_step(params, estimator, execution_input)

    model_evaluation_processor = create_evaluation_processor(params,
                                                             sagemaker_role)
    evaluation_step = create_evaluation_step(
Example #28
from stepfunctions.workflow import Workflow
from stepfunctions.template import TrainingPipeline
from stepfunctions.template.utils import replace_parameters_with_jsonpath

stepfunctions.set_stream_logger(level=logging.INFO)

region = boto3.Session().region_name

# Create a schema for input
event_input = ExecutionInput(
    schema={
        'BuildId': str,
        'Job': str,
        'Model': str,
        'Endpoint': str,
        'ecrArn': str,
        'dataBucketPath': str,
        'authorDate': str,
        'DynamoDBTable': str,
        'triggerSource': str,
        'commitId': str,
    })

# Define static variables determined by appsec
sagemaker_role = 'arn:aws:iam::029186701721:role/qls-28580-acffd3aac73526af-SageMakerRole-R671IS83H4LJ'
workflow_role = 'arn:aws:iam::029186701721:role/qls-28580-acffd3aac73526af-StepFunctionsRole-13AGQ50ASU7XJ'
ecr_Arn = 'latest'
state_machine_arn = 'arn:aws:states:us-west-2:029186701721:stateMachine:trainingStateMachine-Z9vntGZ6ypil'
state_machine_name = 'trainingStateMachine-Z9vntGZ6ypil'
dynamoDBTable = 'qls-28580-acffd3aac73526af-DynamoDBTable-460366LPOX1P'
endpoint_wait_lambda = 'arn:aws:lambda:us-west-2:029186701721:function:qls-28580-acffd3aac73526af-endpointWaitLambda-W5QtjVqVyuJB'
Example #29
              framework_version="0.90-2",
              py_version="py3",
              role=sagemaker_execution_role,
              debugger_hook_config=debug_hook_config,
              rules=debug_rules)

# Upload model code to s3

xgb.prepare_workflow_for_training(job_name)
print('uploaded code to: {}'.format(xgb.uploaded_code.s3_prefix))

# Create Workflow steps

execution_input = ExecutionInput(schema={
    'TrainLocation': str,
    'ValidationLocation': str,
    'EndpointName': str
})
execution_params = {
    'TrainLocation': input_train_path,
    'ValidationLocation': input_validation_path,
    'EndpointName': endpoint_name
}

training_step = steps.TrainingStep(
    'Train Step',
    estimator=xgb,
    data={
        'train':
        sagemaker.s3_input(execution_input['TrainLocation'],
                           content_type='libsvm'),
Example #30
def test_parallel_state_with_placeholders():
    workflow_input = ExecutionInput()

    parallel_state = Parallel('ParallelState01')

    branch_A = Pass('Branch_A',
                    parameters={
                        'ParamA': parallel_state.output()['A']["B"],
                        'ParamB': workflow_input["Key01"]
                    })

    branch_B = Pass('Branch_B',
                    parameters={
                        'ParamA':
                        "TestValue",
                        'ParamB':
                        parallel_state.output()["Response"]["Key"]["State"]
                    })

    branch_C = Pass('Branch_C',
                    parameters={
                        'ParamA':
                        parallel_state.output()['A']["B"].get("C", float),
                        'ParamB': "HelloWorld"
                    })

    parallel_state.add_branch(branch_A)
    parallel_state.add_branch(branch_B)
    parallel_state.add_branch(branch_C)

    workflow_definition = Chain([parallel_state])
    result = Graph(workflow_definition).to_dict()

    expected_repr = {
        "StartAt": "ParallelState01",
        "States": {
            "ParallelState01": {
                "Type":
                "Parallel",
                "End":
                True,
                "Branches": [{
                    "StartAt": "Branch_A",
                    "States": {
                        "Branch_A": {
                            "Parameters": {
                                "ParamA.$": "$['A']['B']",
                                "ParamB.$": "$$.Execution.Input['Key01']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }, {
                    "StartAt": "Branch_B",
                    "States": {
                        "Branch_B": {
                            "Parameters": {
                                "ParamA": "TestValue",
                                "ParamB.$": "$['Response']['Key']['State']"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }, {
                    "StartAt": "Branch_C",
                    "States": {
                        "Branch_C": {
                            "Parameters": {
                                "ParamA.$": "$['A']['B']['C']",
                                "ParamB": "HelloWorld"
                            },
                            "Type": "Pass",
                            "End": True
                        }
                    }
                }]
            }
        }
    }

    assert result == expected_repr