def main():
    """Create and run a Step Functions workflow training two models in parallel.

    Two PyTorch training jobs (projection discriminator and WGAN-GP) are wrapped
    in TrainingSteps, attached as separate branches of a Parallel state, and the
    resulting workflow is created and executed immediately.
    """
    # Side effect: initialises the default boto/sagemaker session used by the SDK.
    sagemaker_session = sagemaker.Session()
    stepfunctions.set_stream_logger(level=logging.INFO)

    # NOTE(review): hard-coded account resources. Bucket name is spelled
    # "backet" — presumably the bucket's actual name; confirm before changing.
    bucket = 's3://pixiv-image-backet'
    sagemaker_execution_role = 'arn:aws:iam::829044821271:role/service-role/AmazonSageMaker-ExecutionRole-20200412T194702'
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    # The two branches differ only in source_dir and state id; both training
    # jobs share the 'PD-Train-' name prefix, matching the original behaviour.
    branch_specs = (
        ('projection_discriminator', 'Train Step1'),
        ('wgan_gp', 'Train Step2'),
    )

    parallel_state = steps.Parallel(state_id='Parallel')
    for source_dir, state_id in branch_specs:
        estimator = PyTorch(
            entry_point='train.py',
            source_dir=source_dir,
            role=sagemaker_execution_role,
            framework_version='1.4.0',
            train_instance_count=2,
            train_instance_type='ml.m5.2xlarge',
            hyperparameters={'train_epoch': 1},
        )
        parallel_state.add_branch(
            steps.TrainingStep(
                state_id=state_id,
                estimator=estimator,
                data={'training': bucket},
                job_name=f'PD-Train-{uuid.uuid4()}',
            )
        )

    workflow = Workflow(
        name=f'MyTraining-{uuid.uuid4()}',
        definition=steps.Chain([parallel_state]),
        role=workflow_execution_role,
    )
    workflow.create()
    workflow.execute()
'ValidationLocation': str, 'EndpointName': str }) execution_params = { 'TrainLocation': input_train_path, 'ValidationLocation': input_validation_path, 'EndpointName': endpoint_name } training_step = steps.TrainingStep( 'Train Step', estimator=xgb, data={ 'train': sagemaker.s3_input(execution_input['TrainLocation'], content_type='libsvm'), 'validation': sagemaker.s3_input(execution_input['ValidationLocation'], content_type='libsvm') }, job_name=job_name # Require embedding this to job_name matches uploaded code ) model_step = steps.ModelStep('Save model', model=training_step.get_expected_model(), model_name=job_name) endpoint_config_step = steps.EndpointConfigStep("Create Endpoint Config", endpoint_config_name=job_name, model_name=job_name, initial_instance_count=1,
def create_training_step(
    image_uri,
    hyperparameters,
    input_data,
    output_data,
    execution_input,
    query_training_function_name,
    region,
    role,
):
    """Build the training portion of the state machine as a Chain of steps.

    The chain is: train an XGBoost model -> save the model -> query the
    training metrics via Lambda -> branch on RMSE (< 10 succeeds, otherwise
    the execution fails).

    Args:
        image_uri: ECR URI of the XGBoost training image.
        hyperparameters: dict of overrides merged over the defaults below.
        input_data: dict with "TrainingUri" and "ValidationUri" S3 locations.
        output_data: dict with "ModelOutputUri" for the model artifacts.
        execution_input: ExecutionInput placeholders (TrainingJobName,
            ExperimentName, TrialName, Git/Data version tags).
        query_training_function_name: name of the Lambda that returns
            training metrics for a job.
        region: AWS region (unused here; kept for interface compatibility).
        role: IAM role ARN assumed by the SageMaker training job.

    Returns:
        steps.states.Chain: training -> model -> query -> accuracy choice.
    """
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(
        image_uri,
        role,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        output_path=output_data[
            "ModelOutputUri"],  # NOTE: Can't use execution_input here
    )

    # Set the hyperparameters, overriding the defaults with any caller-supplied
    # values (caller values win in the dict merge).
    hp = {
        "max_depth": "9",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "300",
        "subsample": "0.8",
        "objective": "reg:linear",
        "early_stopping_rounds": "10",
        "num_round": "100",
    }
    xgb.set_hyperparameters(**{**hp, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.inputs.TrainingInput(
        s3_data=input_data["TrainingUri"], content_type="csv")
    s3_input_val = sagemaker.inputs.TrainingInput(
        s3_data=input_data["ValidationUri"], content_type="csv")
    data = {"train": s3_input_train, "validation": s3_input_val}

    # Create the training step
    training_step = steps.TrainingStep(
        "Training Job",
        estimator=xgb,
        data=data,
        job_name=execution_input["TrainingJobName"],
        experiment_config={
            "ExperimentName": execution_input["ExperimentName"],
            "TrialName": execution_input["TrialName"],
            "TrialComponentDisplayName": "Training",
        },
        tags={
            "GitBranch": execution_input["GitBranch"],
            "GitCommitHash": execution_input["GitCommitHash"],
            "DataVersionId": execution_input["DataVersionId"],
        },
        result_path="$.TrainingResults",
    )

    # Fail the whole execution if the training job itself fails.
    training_step.add_catch(
        stepfunctions.steps.states.Catch(
            error_equals=["States.TaskFailed"],
            next_step=stepfunctions.steps.states.Fail(
                "Training failed", cause="SageMakerTrainingJobFailed"),
        ))

    # Must follow the training step
    model_step = steps.sagemaker.ModelStep(
        "Save Model",
        input_path="$.TrainingResults",
        model=training_step.get_expected_model(),
        model_name=execution_input["TrainingJobName"],
        result_path="$.ModelStepResults",
    )

    # Query the training step results via Lambda
    training_query_step = steps.compute.LambdaStep(
        "Query Training Results",
        parameters={
            "FunctionName": query_training_function_name,
            "Payload": {
                "TrainingJobName.$": "$.TrainingJobName"
            },
        },
        result_path="$.QueryTrainingResults",
    )

    # FIX: this Fail state was labelled "Model Error Too Low", but it is the
    # Choice default — reached only when RMSE is NOT below the threshold, i.e.
    # the error is too HIGH (its own comment already said so).
    check_accuracy_fail_step = steps.states.Fail(
        "Model Error Too High",
        comment="RMSE accuracy higher than threshold")
    check_accuracy_succeed_step = steps.states.Succeed(
        "Model Error Acceptable")

    # TODO: Update query method to query validation error using better result path
    threshold_rule = steps.choice_rule.ChoiceRule.NumericLessThan(
        variable=training_query_step.output()["QueryTrainingResults"]
        ["Payload"]["results"]["TrainingMetrics"][0]["Value"],
        value=10,
    )
    check_accuracy_step = steps.states.Choice("RMSE < 10")
    check_accuracy_step.add_choice(rule=threshold_rule,
                                   next_step=check_accuracy_succeed_step)
    check_accuracy_step.default_choice(next_step=check_accuracy_fail_step)

    # Return the chain of these steps
    return steps.states.Chain(
        [training_step, model_step, training_query_step, check_accuracy_step])
# Create an estimator with training specifications for modelA custom_estimatorA = sagemaker.estimator.Estimator( ecr_ArnA, role = sagemaker_role, train_instance_count = 1, train_instance_type = 'ml.m5.xlarge', train_volume_size = 10, output_path=model_bucketA, volume_kms_key=kms_key ) # Create a step to train for modelA training_stepA = steps.TrainingStep( 'Train ModelA', estimator=custom_estimatorA, data={ 'training': sagemaker.inputs.TrainingInput(event_input['dataBucketPath'], content_type='csv') }, job_name="States.Format('JobA-{}', $$.Execution.Input['BuildId'])", result_path='$.train_step_result' ) # Add a retry configuration to the training_step training_stepA.add_retry(SageMaker_throttling_retry) # Create a step to save the modelA model_stepA = steps.ModelStep( 'Create ModelA', model=training_stepA.get_expected_model(), model_name=event_input['ModelA'], result_path='$.save_step_result' ) # Add a retry configuration to the model_step
'Extract, Transform, Load', parameters={"JobName": job_name, "Arguments":{ '--S3_SOURCE': data_source, '--S3_DEST': 's3a://{}/{}/'.format(bucket, project_name), '--TRAIN_KEY': train_prefix + '/', '--VAL_KEY': val_prefix +'/'} } ) training_step = steps.TrainingStep( 'Model Training', estimator=xgb, data={ 'train': s3_input(train_data, content_type='csv'), 'validation': s3_input(validation_data, content_type='csv') }, job_name=training_job_name, wait_for_completion=True ) model_step = steps.ModelStep( 'Save Model', model=training_step.get_expected_model(), model_name=execution_input['ModelName'], result_path='$.ModelStepResults' ) lambda_step = steps.compute.LambdaStep( 'Query Training Results', parameters={
'Execute AWS Batch job', parameters={ "JobDefinition":execution_input['BatchJobDefinition'], "JobName": execution_input['BatchJobName'], "JobQueue": execution_input['BatchJobQueue'] } ) ## SageMaker の学習ジョブを実行するステップ estimator = create_estimator() data_path = {'train': args.data_path} training_step = steps.TrainingStep( 'Train Step', estimator=estimator, data=data_path, job_name=execution_input['TrainJobName'], wait_for_completion=False # SFnを実行した後に Bitbucket へプルリクを上げるように変更したため、ここは True で良いかも。 ) # 各 Step を連結 chain_list = [etl_step, training_step] workflow_definition = steps.Chain(chain_list) # Workflow の作成 workflow = Workflow( name=FLOW_NAME, definition=workflow_definition, role=WORKFLOW_ROLE, execution_input=execution_input )
custom_estimator = sagemaker.estimator.Estimator( ecr_Arn, sagemaker_role, train_instance_count=1, train_instance_type='ml.m5.large', train_volume_size=10, output_path=model_artifact_bucket, volume_kms_key=kms_key, max_run=300) # Create a step to train the model training_step = steps.TrainingStep('Train', estimator=custom_estimator, data={ 'training': sagemaker.inputs.TrainingInput( event_input['dataBucketPath'], content_type='csv') }, job_name=event_input['Job'], result_path='$.train_step_result') # Add a retry configuration to the training_step training_step.add_retry(SageMaker_throttling_retry) # Create a step to create the model model_step = steps.ModelStep('Create model', model=training_step.get_expected_model(), model_name=event_input['Model'], result_path='$.create_step_result') # Add a retry configuration to the model_step model_step.add_retry(SageMaker_throttling_retry)