            'Arg2'
        ],
        code=step_code
    ),
    cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value)
phase_1.branch(step_task)

# Define an AddStep Task for the Validation Step
validate_phase_1 = emr_chains.AddStepWithArgumentOverrides(
    stack, 'ValidatePhase1',
    emr_step=emr_code.EMRStep(
        name='Validate Phase 1',
        jar='s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar',
        args=[
            f'{step_code.s3_path}/phase_1/test_validation.sh',
            'Arg1',
            'Arg2'
        ],
        code=step_code
    ),
    cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value,
    result_path='$.ValidatePhase1Result',
    fail_chain=fail)

# Create a Parallel Task for the Phase 2 Steps
phase_2 = sfn.Parallel(stack, 'Phase2', result_path='$.Result.Phase2')

# Add a Failure catch to our Parallel phase
phase_2.add_catch(terminate_failed_cluster, errors=['States.ALL'], result_path='$.Error')
def __init__(self, scope: core.Construct, id: str, emr_launch_stack, artifact_bucket, output_bucket, **kwargs):
    super().__init__(scope, id, **kwargs)

    launch_function = emr_launch_stack.launch_function

    # Create DynamoDB table for tracking
    dynamo_table = dynamo.Table(
        self,
        "dynamotable",
        partition_key=dynamo.Attribute(name="BatchId", type=dynamo.AttributeType.STRING),
        sort_key=dynamo.Attribute(name="Name", type=dynamo.AttributeType.STRING),
        billing_mode=dynamo.BillingMode.PAY_PER_REQUEST)

    emr_role = aws_iam.Role.from_role_arn(
        self, "emr_role_iam", role_arn=emr_launch_stack.instance_role_arn)

    emr_role.add_to_policy(
        aws_iam.PolicyStatement(
            actions=["dynamodb:*"],
            resources=[dynamo_table.table_arn]))

    emr_role.add_to_policy(
        aws_iam.PolicyStatement(
            actions=[
                "logs:CreateLogStream",
                "logs:DescribeLogStreams",
                "logs:CreateLogGroup",
                "logs:PutLogEvents",
                "ec2:DescribeTags"
            ],
            resources=["*"]))

    # SNS Topics for Success/Failure messages from our Pipeline
    success_topic = sns.Topic(self, 'SuccessTopic')
    failure_topic = sns.Topic(self, 'FailureTopic')

    # Upload artifacts to S3
    step_code = s3d.BucketDeployment(
        self,
        id='sparkscript',
        destination_bucket=artifact_bucket,
        destination_key_prefix='steps',
        sources=[
            s3d.Source.asset('infrastructure/emr_orchestration/steps/')
        ])

    # Create a Chain to receive Failure messages
    fail = emr_chains.Fail(
        self,
        'FailChain',
        message=sfn.TaskInput.from_data_at('$.Error'),
        subject='Pipeline Failure',
        topic=failure_topic)

    # Define a Task to Terminate the Cluster on failure
    terminate_failed_cluster = emr_tasks.TerminateClusterBuilder.build(
        self,
        'TerminateFailedCluster',
        name='Terminate Failed Cluster',
        cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value,
        result_path='$.TerminateResult').add_catch(
            fail, errors=['States.ALL'], result_path='$.Error')
    terminate_failed_cluster.next(fail)

    # Use a NestedStateMachine to launch the cluster
    launch_cluster = emr_chains.NestedStateMachine(
        self,
        'NestedStateMachine',
        name='Launch Cluster StateMachine',
        state_machine=launch_function.state_machine,
        fail_chain=fail)

    pyspark_step = emr_chains.AddStepWithArgumentOverrides(
        self,
        'PySparkSceneDetection',
        emr_step=emr_code.EMRStep(
            name='Scene Detection - PySpark Job',
            jar='command-runner.jar',
            args=[
                'spark-submit',
                '--master', 'yarn',
                '--deploy-mode', 'cluster',
                '--packages', 'com.audienceproject:spark-dynamodb_2.12:1.1.2',
                os.path.join(f's3://{artifact_bucket.bucket_name}', 'steps', 'scene_detection.py'),
                '--batch-id', 'DynamoDB.BatchId',
                '--batch-metadata-table-name', dynamo_table.table_name,
                '--output-bucket', output_bucket.bucket_name,
                '--synchronized-table-name', 'synchronized-signals'
            ]),
        cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value,
        result_path='$.PySparkResult',
        fail_chain=terminate_failed_cluster)

    # Define a Task to Terminate the Cluster
    terminate_cluster = emr_tasks.TerminateClusterBuilder.build(
        self,
        'TerminateCluster',
        name='Terminate Cluster',
        cluster_id=sfn.TaskInput.from_data_at('$.LaunchClusterResult.ClusterId').value,
        result_path='$.TerminateResult').add_catch(
            fail, errors=['States.ALL'], result_path='$.Error')

    # A Chain for Success notification when the pipeline completes
    success = emr_chains.Success(
        self,
        'SuccessChain',
        message=sfn.TaskInput.from_data_at('$.TerminateResult'),
        subject='Pipeline Succeeded',
        topic=success_topic)

    # Assemble the Pipeline
    definition = sfn.Chain \
        .start(launch_cluster) \
        .next(pyspark_step) \
        .next(terminate_cluster) \
        .next(success)

    # Create the State Machine
    self.state_machine = sfn.StateMachine(
        self,
        'SceneDetectionStateMachine',
        state_machine_name='scene-detection-pipeline',
        definition=definition)

    self.dynamo_table = dynamo_table
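
# A minimal usage sketch (an assumption, not part of the sample): the __init__ above
# belongs to a Construct/Stack subclass, referred to here as StepFunctionStack; the
# EmrLaunchStack and StorageStack names below are hypothetical placeholders for the
# stacks that provide the launch function and the artifact/output buckets.
#
#   app = core.App()
#   emr_launch = EmrLaunchStack(app, 'scene-detection-emr-launch')    # hypothetical stack
#   storage = StorageStack(app, 'scene-detection-storage')            # hypothetical stack
#   StepFunctionStack(
#       app, 'scene-detection-orchestration',
#       emr_launch_stack=emr_launch,
#       artifact_bucket=storage.artifact_bucket,
#       output_bucket=storage.output_bucket)
#   app.synth()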
def test_add_step_with_argument_overrides():
    default_fragment_json = {
        'Type': 'Parallel',
        'End': True,
        'Branches': [{
            'StartAt': 'test-fragment: test-step - Override Args',
            'States': {
                'test-fragment: test-step - Override Args': {
                    'Next': 'test-fragment: test-step',
                    'Catch': [{
                        'ErrorEquals': ['States.ALL'],
                        'ResultPath': '$.Error',
                        'Next': 'test-fail'
                    }],
                    'Parameters': {
                        'ExecutionInput.$': '$$.Execution.Input',
                        'StepName': 'test-step',
                        'Args': ['Arg1', 'Arg2']
                    },
                    'Type': 'Task',
                    'Resource': {
                        'Fn::GetAtt': ['OverrideStepArgsE9376C9F', 'Arn']
                    },
                    'ResultPath': '$.test-fragmentResultArgs'
                },
                'test-fragment: test-step': {
                    'End': True,
                    'Catch': [{
                        'ErrorEquals': ['States.ALL'],
                        'ResultPath': '$.Error',
                        'Next': 'test-fail'
                    }],
                    'Parameters': {
                        'ClusterId': 'test-cluster-id',
                        'Step': {
                            'Name': 'test-step',
                            'ActionOnFailure': 'CONTINUE',
                            'HadoopJarStep': {
                                'Jar': 'Jar',
                                'MainClass': 'Main',
                                'Args.$': '$.test-fragmentResultArgs',
                                'Properties': []
                            }
                        }
                    },
                    'Type': 'Task',
                    'Resource': {
                        'Fn::Join': [
                            '', [
                                'arn:',
                                {'Ref': 'AWS::Partition'},
                                ':states:::elasticmapreduce:addStep.sync'
                            ]
                        ]
                    }
                },
                'test-fail': {
                    'Type': 'Fail'
                }
            }
        }]
    }

    stack = core.Stack(core.App(), 'test-stack')

    fragment = emr_chains.AddStepWithArgumentOverrides(
        stack, 'test-fragment',
        emr_step=emr_code.EMRStep('test-step', 'Jar', 'Main', ['Arg1', 'Arg2']),
        cluster_id='test-cluster-id',
        fail_chain=sfn.Fail(stack, 'test-fail'))

    print_and_assert(default_fragment_json, fragment)
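
# The test above calls a print_and_assert helper that is not shown. A minimal sketch of
# what such a helper could look like (an assumption, not the library's actual
# implementation): render the fragment as a single Parallel state, resolve CDK tokens
# (e.g. the Fn::GetAtt reference) against the stack, and compare with the expected JSON.
def print_and_assert(default_fragment_json: dict, fragment: sfn.StateMachineFragment):
    stack = core.Stack.of(fragment)
    # to_single_state() wraps the fragment in a Parallel state, matching the
    # 'Type': 'Parallel' document above; stack.resolve() replaces unresolved tokens.
    resolved_fragment = stack.resolve(fragment.to_single_state().to_state_json())
    print(default_fragment_json)
    print(resolved_fragment)
    assert default_fragment_json == resolved_fragment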