Example #1
import logging
import uuid

import sagemaker
import stepfunctions
from sagemaker.pytorch import PyTorch
from stepfunctions import steps
from stepfunctions.workflow import Workflow


def main():
    sagemaker_session = sagemaker.Session()
    stepfunctions.set_stream_logger(level=logging.INFO)

    bucket = 's3://pixiv-image-backet'

    sagemaker_execution_role = 'arn:aws:iam::829044821271:role/service-role/AmazonSageMaker-ExecutionRole-20200412T194702'
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    estimator1 = PyTorch(entry_point='train.py',
                         source_dir='projection_discriminator',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    estimator2 = PyTorch(entry_point='train.py',
                         source_dir='wgan_gp',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    training_step1 = steps.TrainingStep(state_id='Train Step1',
                                        estimator=estimator1,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(
                                            uuid.uuid4()))

    training_step2 = steps.TrainingStep(state_id='Train Step2',
                                        estimator=estimator2,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='WGAN-Train-{0}'.format(
                                            uuid.uuid4()))

    parallel_state = steps.Parallel(state_id='Parallel')

    parallel_state.add_branch(training_step1)
    parallel_state.add_branch(training_step2)

    workflow_definition = steps.Chain([parallel_state])

    workflow = Workflow(
        name='MyTraining-{0}'.format(uuid.uuid4()),
        definition=workflow_definition,
        role=workflow_execution_role,
    )

    workflow.create()
    workflow.execute()
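
Workflow.execute() returns an Execution handle; a variant sketch of the last line above that keeps the handle and waits for the result (not in the original snippet):

    execution = workflow.execute()
    result = execution.get_output(wait=True)  # block until the execution finishes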
Example #2
execution_input = ExecutionInput(schema={
    'TrainLocation': str,
    'ValidationLocation': str,
    'EndpointName': str
})
execution_params = {
    'TrainLocation': input_train_path,
    'ValidationLocation': input_validation_path,
    'EndpointName': endpoint_name
}

training_step = steps.TrainingStep(
    'Train Step',
    estimator=xgb,
    data={
        'train':
        sagemaker.s3_input(execution_input['TrainLocation'],
                           content_type='libsvm'),
        'validation':
        sagemaker.s3_input(execution_input['ValidationLocation'],
                           content_type='libsvm')
    },
    job_name=job_name  # Required: embedding this ensures job_name matches the uploaded code
)

model_step = steps.ModelStep('Save model',
                             model=training_step.get_expected_model(),
                             model_name=job_name)

endpoint_config_step = steps.EndpointConfigStep("Create Endpoint Config",
                                                endpoint_config_name=job_name,
                                                model_name=job_name,
                                                initial_instance_count=1,
                                                instance_type='ml.m4.xlarge')  # assumed; the source snippet is truncated here
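
This snippet never shows the workflow being started; with this SDK, the execution_params above are supplied at execution time. A minimal sketch, assuming a Workflow named workflow built from these steps:

execution = workflow.execute(inputs=execution_params)
result = execution.get_output(wait=True)  # block until the execution finishes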
Example #3
def create_training_step(
    image_uri,
    hyperparameters,
    input_data,
    output_data,
    execution_input,
    query_training_function_name,
    region,
    role,
):
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(
        image_uri,
        role,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        output_path=output_data[
            "ModelOutputUri"],  # NOTE: Can't use execution_input here
    )

    # Set the hyperparameters overriding with any defaults
    hp = {
        "max_depth": "9",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "300",
        "subsample": "0.8",
        "objective": "reg:linear",
        "early_stopping_rounds": "10",
        "num_round": "100",
    }
    xgb.set_hyperparameters(**{**hp, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.inputs.TrainingInput(
        s3_data=input_data["TrainingUri"], content_type="csv")
    s3_input_val = sagemaker.inputs.TrainingInput(
        s3_data=input_data["ValidationUri"], content_type="csv")
    data = {"train": s3_input_train, "validation": s3_input_val}

    # Create the training step
    training_step = steps.TrainingStep(
        "Training Job",
        estimator=xgb,
        data=data,
        job_name=execution_input["TrainingJobName"],
        experiment_config={
            "ExperimentName": execution_input["ExperimentName"],
            "TrialName": execution_input["TrialName"],
            "TrialComponentDisplayName": "Training",
        },
        tags={
            "GitBranch": execution_input["GitBranch"],
            "GitCommitHash": execution_input["GitCommitHash"],
            "DataVersionId": execution_input["DataVersionId"],
        },
        result_path="$.TrainingResults",
    )

    # Add the catch
    training_step.add_catch(
        stepfunctions.steps.states.Catch(
            error_equals=["States.TaskFailed"],
            next_step=stepfunctions.steps.states.Fail(
                "Training failed", cause="SageMakerTrainingJobFailed"),
        ))

    # The model step must follow the training step
    model_step = steps.sagemaker.ModelStep(
        "Save Model",
        input_path="$.TrainingResults",
        model=training_step.get_expected_model(),
        model_name=execution_input["TrainingJobName"],
        result_path="$.ModelStepResults",
    )

    # Query the training step
    training_query_step = steps.compute.LambdaStep(
        "Query Training Results",
        parameters={
            "FunctionName": query_training_function_name,
            "Payload": {
                "TrainingJobName.$": "$.TrainingJobName"
            },
        },
        result_path="$.QueryTrainingResults",
    )

    check_accuracy_fail_step = steps.states.Fail(
        "Model Error Too High", comment="RMSE is above the acceptable threshold")

    check_accuracy_succeed_step = steps.states.Succeed(
        "Model Error Acceptable")

    # TODO: Update query method to query validation error using better result path
    threshold_rule = steps.choice_rule.ChoiceRule.NumericLessThan(
        variable=training_query_step.output()["QueryTrainingResults"]
        ["Payload"]["results"]["TrainingMetrics"][0]["Value"],
        value=10,
    )

    check_accuracy_step = steps.states.Choice("RMSE < 10")

    check_accuracy_step.add_choice(rule=threshold_rule,
                                   next_step=check_accuracy_succeed_step)
    check_accuracy_step.default_choice(next_step=check_accuracy_fail_step)

    # Return the chain of these steps
    return steps.states.Chain(
        [training_step, model_step, training_query_step, check_accuracy_step])
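
The chain returned here still has to be attached to a workflow. A minimal sketch, assuming ExecutionInput from stepfunctions.inputs and hypothetical values for the function's arguments:

execution_input = ExecutionInput(schema={
    'TrainingJobName': str,
    'ExperimentName': str,
    'TrialName': str,
    'GitBranch': str,
    'GitCommitHash': str,
    'DataVersionId': str,
})
chain = create_training_step(
    image_uri,              # assumed: a training container URI
    hyperparameters={},
    input_data=input_data,
    output_data=output_data,
    execution_input=execution_input,
    query_training_function_name='query-training-results',  # assumed Lambda name
    region='us-east-1',     # assumed
    role=role,
)
workflow = Workflow(name='training-pipeline',  # assumed name
                    definition=chain,
                    role=workflow_execution_role,
                    execution_input=execution_input)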
Example #4
# Create an estimator with training specifications for modelA
custom_estimatorA = sagemaker.estimator.Estimator(
    ecr_ArnA,
    role=sagemaker_role,
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    train_volume_size=10,
    output_path=model_bucketA,
    volume_kms_key=kms_key
)
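
# SageMaker_throttling_retry is used below but never defined in this snippet.
# A plausible definition (the error list and timing values are assumptions):
SageMaker_throttling_retry = steps.states.Retry(
    error_equals=['ThrottlingException', 'SageMaker.AmazonSageMakerException'],
    interval_seconds=15,
    max_attempts=5,
    backoff_rate=2
)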
# Create a step to train for modelA
training_stepA = steps.TrainingStep(
    'Train ModelA',
    estimator=custom_estimatorA,
    data={
        'training': sagemaker.inputs.TrainingInput(event_input['dataBucketPath'], content_type='csv')
    },
    job_name="States.Format('JobA-{}', $$.Execution.Input['BuildId'])",
    result_path='$.train_step_result'
)
# Add a retry configuration to the training_step
training_stepA.add_retry(SageMaker_throttling_retry)

# Create a step to save the modelA
model_stepA = steps.ModelStep(
    'Create ModelA',
    model=training_stepA.get_expected_model(),
    model_name=event_input['ModelA'],
    result_path='$.save_step_result'
)
# Add a retry configuration to the model_step
model_stepA.add_retry(SageMaker_throttling_retry)
Example #5
etl_step = steps.GlueStartJobRunStep(
    'Extract, Transform, Load',
    parameters={"JobName": job_name,
                "Arguments":{
                    '--S3_SOURCE': data_source,
                    '--S3_DEST': 's3a://{}/{}/'.format(bucket, project_name),
                    '--TRAIN_KEY': train_prefix + '/',
                    '--VAL_KEY': val_prefix +'/'}
               }
)
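
On the Glue side, the Arguments above arrive as job parameters. A minimal sketch of how the ETL script (not shown in the original) might read them with Glue's getResolvedOptions helper:

import sys
from awsglue.utils import getResolvedOptions

# Keys match the '--' argument names passed by the ETL step above
args = getResolvedOptions(sys.argv, ['S3_SOURCE', 'S3_DEST', 'TRAIN_KEY', 'VAL_KEY'])
source, dest = args['S3_SOURCE'], args['S3_DEST']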


training_step = steps.TrainingStep(
    'Model Training', 
    estimator=xgb,
    data={
        'train': s3_input(train_data, content_type='csv'),
        'validation': s3_input(validation_data, content_type='csv')
    },
    job_name=training_job_name,
    wait_for_completion=True
)

model_step = steps.ModelStep(
    'Save Model',
    model=training_step.get_expected_model(),
    model_name=execution_input['ModelName'],
    result_path='$.ModelStepResults'
)

lambda_step = steps.compute.LambdaStep(
    'Query Training Results',
    parameters={  
Example #6
    etl_step = steps.BatchSubmitJobStep(
        'Execute AWS Batch job',
        parameters={
            "JobDefinition":execution_input['BatchJobDefinition'],
            "JobName": execution_input['BatchJobName'],
            "JobQueue": execution_input['BatchJobQueue'] 
            }
    )

    ## Step that runs the SageMaker training job
    estimator = create_estimator()
    data_path = {'train': args.data_path}

    training_step = steps.TrainingStep(
        'Train Step', 
        estimator=estimator,
        data=data_path,
        job_name=execution_input['TrainJobName'],
        wait_for_completion=False  # The flow was changed to open a pull request to Bitbucket after Step Functions runs, so this could probably be True.
    )

    # Chain the steps together
    chain_list = [etl_step, training_step]
    workflow_definition = steps.Chain(chain_list)

    # Create the workflow
    workflow = Workflow(
        name=FLOW_NAME,
        definition=workflow_definition,
        role=WORKFLOW_ROLE,
        execution_input=execution_input
    )
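
    # This example stops after constructing the Workflow. A sketch of creating and
    # starting it, mirroring Example #1 (the input values here are assumptions):
    workflow.create()
    workflow.execute(inputs={
        'BatchJobDefinition': 'my-etl-job-def',  # assumed
        'BatchJobName': 'etl-job',               # assumed
        'BatchJobQueue': 'default-queue',        # assumed
        'TrainJobName': 'train-job-001',         # assumed
    })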
Example #7
custom_estimator = sagemaker.estimator.Estimator(
    ecr_Arn,
    sagemaker_role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    train_volume_size=10,
    output_path=model_artifact_bucket,
    volume_kms_key=kms_key,
    max_run=300)

# Create a step to train the model
training_step = steps.TrainingStep('Train',
                                   estimator=custom_estimator,
                                   data={
                                       'training':
                                       sagemaker.inputs.TrainingInput(
                                           event_input['dataBucketPath'],
                                           content_type='csv')
                                   },
                                   job_name=event_input['Job'],
                                   result_path='$.train_step_result')
# Add a retry configuration to the training_step
training_step.add_retry(SageMaker_throttling_retry)

# Create a step to create the model
model_step = steps.ModelStep('Create model',
                             model=training_step.get_expected_model(),
                             model_name=event_input['Model'],
                             result_path='$.create_step_result')
# Add a retry configuration to the model_step
model_step.add_retry(SageMaker_throttling_retry)
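
As in the earlier examples, these steps would then be chained into a workflow. A minimal sketch (the workflow name and role are assumptions):

workflow_definition = steps.Chain([training_step, model_step])
workflow = Workflow(name='custom-model-pipeline',
                    definition=workflow_definition,
                    role=workflow_execution_role)
workflow.create()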