def emr_step_task_group(script_name, cluster_id, aws_conn_id, dag):
    # file_schema and bucket_name are assumed to be module-level constants,
    # e.g. 's3://' and 'my-bucket/'.
    step = [{
        'Name': f'Run {script_name}.py',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                '--deploy-mode', 'client',
                '--py-files', f'{file_schema}{bucket_name}scripts/utils.zip',
                f'{file_schema}{bucket_name}scripts/{script_name}.py',
                '--config-path', f'{file_schema}{bucket_name}scripts/config.yaml',
            ]
        }
    }]

    add_step = EmrAddStepsOperator(
        task_id='add_step',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        steps=step,
        dag=dag)

    wait_step_completion = EmrStepSensor(
        task_id='wait_step_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        step_id=f"{{{{ ti.xcom_pull(task_ids='run_{script_name}.add_step')[0] }}}}",
        dag=dag)

    add_step.set_downstream(wait_step_completion)

    return add_step, wait_step_completion
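# A minimal usage sketch (hypothetical values): the 'run_{script_name}.add_step'
# task id pulled by the sensor above implies the helper is meant to be called
# inside a TaskGroup named f'run_{script_name}'; `dag` is assumed to be defined.
from airflow.utils.task_group import TaskGroup

with TaskGroup(group_id='run_transform', dag=dag):
    add_step, wait_step_completion = emr_step_task_group(
        script_name='transform',
        cluster_id='j-XXXXXXXXXXXX',
        aws_conn_id='aws_default',
        dag=dag)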
def test_render_template_from_file(self):
    dag = DAG(
        dag_id='test_file',
        default_args=self.args,
        template_searchpath=TEMPLATE_SEARCHPATH,
        template_undefined=StrictUndefined)

    file_steps = [{
        'Name': 'test_step1',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['/usr/lib/spark/bin/run-example1']
        }
    }]

    execution_date = timezone.utcnow()

    self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

    test_task = EmrAddStepsOperator(
        task_id='test_task',
        job_flow_id='j-8989898989',
        aws_conn_id='aws_default',
        steps='steps.j2.json',
        dag=dag)

    with patch('boto3.session.Session', self.boto3_session_mock):
        ti = TaskInstance(task=test_task, execution_date=execution_date)
        ti.run()

    self.emr_client_mock.add_job_flow_steps.assert_called_once_with(
        JobFlowId='j-8989898989', Steps=file_steps)
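# The steps.j2.json file resolved via template_searchpath would contain JSON
# that renders to file_steps above — a minimal sketch (hypothetical contents;
# no Jinja variables are exercised by this test):
#
# [{
#     "Name": "test_step1",
#     "ActionOnFailure": "CONTINUE",
#     "HadoopJarStep": {
#         "Jar": "command-runner.jar",
#         "Args": ["/usr/lib/spark/bin/run-example1"]
#     }
# }]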
def Workflow_0(config):
    if config.fabric == "emr":
        workflow_id = "9"
        workflow_version = "latest"
        workflow_jar = f"s3://{config.s3Bucket}/prophecy/jars/9/latest/workflow.jar"
        prophecy_libs_jar = f"{config.prophecyLibsJar}"
        executor_memory = "1g"
        executor_cores = "4"
        num_executors = "6"
        driver_memory = "1g"
        driver_cores = "2"
        job_flow_id = config.cluster_id

        spark_steps = [{
            "Name": "Compute_Step",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "spark-submit",
                    "--executor-memory", executor_memory,
                    "--executor-cores", executor_cores,
                    "--num-executors", num_executors,
                    "--driver-memory", driver_memory,
                    "--driver-cores", driver_cores,
                    "--conf", "spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--conf", "spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--deploy-mode", "cluster",
                    "--class", "Main",
                    "--jars", workflow_jar,
                    prophecy_libs_jar,
                    "-C", "A=B",
                    "-C", "fabricName=" + config.fabric,
                ]
            }
        }]

        step_adder = EmrAddStepsOperator(
            task_id="Workflow_0",
            job_flow_id=job_flow_id,
            aws_conn_id="aws_default_pankaj",
            steps=spark_steps,
            trigger_rule="all_success",
        )
        step_checker = EmrStepSensor(
            task_id="Workflow_0WatchSteps",
            job_flow_id=job_flow_id,
            step_id="{{ task_instance.xcom_pull(task_ids='Workflow_0', key='return_value')[0] }}",
            aws_conn_id="aws_default_pankaj",
        )
        step_adder >> step_checker
        return step_adder, step_checker
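# A minimal stand-in for the config object consumed above (hypothetical; the
# real object is generated by Prophecy from the project's fabric settings):
from dataclasses import dataclass

@dataclass
class Config:
    fabric: str = "emr"
    s3Bucket: str = "my-prophecy-bucket"
    prophecyLibsJar: str = "s3://my-prophecy-bucket/prophecy/libs/prophecy-libs.jar"
    cluster_id: str = "j-XXXXXXXXXXXX"

step_adder, step_checker = Workflow_0(Config())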
    dagrun_timeout=timedelta(hours=2),
    start_date=datetime(2021, 1, 1),
    schedule_interval='0 3 * * *',
    catchup=False,
    tags=['example'],
) as dag:
    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id=cluster_creator.output,
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=cluster_creator.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    )

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=cluster_creator.output)

    step_adder >> step_checker >> cluster_remover
    # [END howto_operator_emr_manual_steps_tasks]
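# Note: cluster_creator.output is an XComArg over the operator's return_value
# XCom, so it resolves to the same job flow id as templating
# "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}"
# by hand, as the older-style snippets below do.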
    description='DAG test',
    schedule_interval=timedelta(days=1),
)

create_emr_cluster = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    region_name='us-east-2',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

emr_step_1 = EmrAddStepsOperator(
    task_id='emr_step1',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step1,
    dag=dag)

emr_step_2 = EmrAddStepsOperator(
    task_id='emr_step2',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step2,
    dag=dag)

emr_step_sensor = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=
    op_kwargs={
        "aws_conn": "my_aws_conn",
        "bucket": "my_bucket",
        "prefix": "file_name"
    },
    provide_context=False)

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=cluster_conf)

add_step_task = EmrAddStepsOperator(
    task_id='My_first_job',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='my_aws_conn',
    steps=my_first_job)

watch_prev_step_task = EmrStepSensor(
    task_id='watch_prev_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    # return_value is a *list* of step ids; index [0] selects the step to watch
    # (the original snippet omitted the index, which would fail at runtime).
    step_id="{{ task_instance.xcom_pull(task_ids='My_first_job', key='return_value')[0] }}",
    aws_conn_id='aws_default')

terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id='terminate_job_flow',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        'ActionOnFailure': action,
        'HadoopJarStep': {
            'Jar': 's3://ap-south-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            'Args': [
                '/usr/bin/aws', 'emr', 'modify-instance-fleet',
                '--cluster-id', '{}'.format(cluster_id),
                '--instance-fleet',
                'InstanceFleetId={},TargetOnDemandCapacity={},TargetSpotCapacity={}'.format(
                    instance_fleet_id, target_on_demand_capacity, target_spot_capacity)
            ]
        }
    }]

step_1 = EmrAddStepsOperator(
    task_id='step_1',
    aws_conn_id='aws_default',
    steps=add_scala_steps(
        'jar-location', 'class-name', 'CONTINUE', "{{ dag_run.conf['date'] }}"),
    dag=dag
)

sensor_1 = EmrStepSensor(
    task_id='sensor_1',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_1', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

step_2 = EmrAddStepsOperator(
    task_id='step_2',
    aws_conn_id='aws_default',
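# A sketch of the add_scala_steps helper called above (hypothetical; inferred
# only from the call site, which passes a jar location, a main class, an
# ActionOnFailure value, and a templated run date):
def add_scala_steps(jar_location, class_name, action, run_date):
    return [{
        'Name': 'run-{}'.format(class_name),
        'ActionOnFailure': action,
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit', '--deploy-mode', 'cluster',
                '--class', class_name,
                jar_location,
                run_date,
            ],
        },
    }]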
    },
    start_date=datetime(2021, 1, 1, 0, 0),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    create_cluster = EmrCreateJobFlowOperator(
        task_id="create_cluster",
        job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES,
        aws_conn_id="aws_default",
        emr_conn_id="emr_default",
    )

    add_step_load_raw_data = EmrAddStepsOperator(
        task_id="add_step_load_raw_data",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.LOAD_RAW_DATA,
    )

    wait_for_step_load_raw_data = EmrStepSensor(
        task_id="wait_for_step_load_raw_data",
        job_flow_id=create_cluster.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    add_step_transform = EmrAddStepsOperator(
        task_id="add_step_transform",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.TRANSFORM,
tags=["emr demo", "spark", "pyspark"], ) as dag: begin = DummyOperator(task_id="begin") end = DummyOperator(task_id="end") cluster_creator = EmrCreateJobFlowOperator( task_id="create_job_flow", job_flow_overrides=get_object( "job_flow_overrides/job_flow_overrides.json", work_bucket), ) step_adder = EmrAddStepsOperator( task_id="add_steps", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id="aws_default", steps=get_object("emr_steps/emr_steps.json", work_bucket), ) step_checker = EmrStepSensor( task_id="watch_step", job_flow_id= "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", aws_conn_id="aws_default", ) begin >> cluster_creator >> step_adder >> step_checker >> end
"spark.yarn.maxAppAttempts=1", "--class", "com.meta.mr.multikey.partner.PartnerStageOne", "{{ dag_run.conf['pidMrMultikeyJarPath'] }}", "s3://{{ dag_run.conf['metaBucketName'] }}/{{ dag_run.conf['instanceId'] }}", "s3://{{ dag_run.conf['advBucketName'] }}/{{ dag_run.conf['instanceId'] }}", "{{ dag_run.conf['outputPath'] }}", "{{ dag_run.conf['inputPath'] }}", ], }, }] stage1_adder = EmrAddStepsOperator( task_id="add_stage_1", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEP_1, dag=dag, ) stage1_checker = EmrStepSensor( task_id="watch_stage1", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='add_stage_1', key='return_value')[0] }}", aws_conn_id="aws_default", dag=dag, ) sensor_stage2_key = S3KeySensor(
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}',
                '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        self.args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=self.args)
        )

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_render_template_2(self):
        dag = DAG(dag_id='test_xcom', default_args=self.args)

        xcom_steps = [
            {
                'Name': 'test_step1',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': ['/usr/lib/spark/bin/run-example1']
                }
            },
            {
                'Name': 'test_step2',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': ['/usr/lib/spark/bin/run-example2']
                }
            }
        ]

        make_steps = DummyOperator(task_id='make_steps', dag=dag, owner='airflow')
        execution_date = timezone.utcnow()
        ti1 = TaskInstance(task=make_steps, execution_date=execution_date)
        ti1.xcom_push(key='steps', value=xcom_steps)

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        test_task = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps="{{ ti.xcom_pull(task_ids='make_steps',key='steps') }}",
            dag=dag)

        with patch('boto3.session.Session', self.boto3_session_mock):
            ti = TaskInstance(task=test_task, execution_date=execution_date)
            ti.run()

        self.emr_client_mock.add_job_flow_steps.assert_called_once_with(
            JobFlowId='j-8989898989', Steps=xcom_steps)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(self.mock_context), ['s-2LH3R5GW3A53T'])

    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            with patch(
                'airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name'
            ) as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args),
                )

                operator.execute(self.mock_context)

        ti = self.mock_context['ti']
        ti.xcom_push.assert_called_once_with(key='job_flow_id', value=expected_job_flow_id)

    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch(
            'airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name'
        ) as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args)
            )

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception), f'No cluster found for name: {cluster_name}')
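# A fixture consistent with the assertions above (hypothetical values, except
# the step id, which test_execute_returns_step_id expects verbatim):
ADD_STEPS_SUCCESS_RETURN = {
    'ResponseMetadata': {'HTTPStatusCode': 200},
    'StepIds': ['s-2LH3R5GW3A53T'],
}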
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *',
    tags=['example'],
) as dag:
    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS)

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        op_kwargs={
            'name': file['spark_step_args']['name'],
            'python_dependencies': file['spark_step_args']['python_dependencies'],
            'jars': file['spark_step_args']['jars'],
            's3_input': file['spark_step_args']['s3_input'],
            's3_script': file['spark_step_args']['s3_script'],
            's3_output': file['spark_step_args']['s3_output'],
            'db_table': file['spark_step_args']['db_table']
        }
    )
    entity_task_list.append(render_spark_step)

    # add spark step to emr
    add_step = EmrAddStepsOperator(
        task_id='add_step_{}'.format(key),
        job_flow_id=job_flow_id,
        aws_conn_id='aws_default',
        steps="{{ task_instance.xcom_pull(task_ids='"
              + 'render_spark_step_{}'.format(key)
              + "', key='return_value') }}",
    )
    entity_task_list.append(add_step)

    # wait for the step to complete
    watch_step = EmrStepSensor(
        task_id='watch_step_{}'.format(key),
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='"
                + 'add_step_{}'.format(key)
                + "', key='return_value')[0] }}",
        aws_conn_id='aws_default'
    )
    entity_task_list.append(watch_step)

    # if file has sql quality steps, then create additional tasks in DAG
    if file['sql_step_args']:
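# A sketch of the python_callable behind the render_spark_step task
# (hypothetical; inferred from the op_kwargs above — it returns a step list
# that the downstream EmrAddStepsOperator pulls from XCom as return_value):
def render_spark_step_fn(name, python_dependencies, jars, s3_input, s3_script,
                         s3_output, db_table, **kwargs):
    return [{
        'Name': name,
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                '--py-files', python_dependencies,
                '--jars', jars,
                s3_script,
                '--input', s3_input,
                '--output', s3_output,
                '--table', db_table,
            ],
        },
    }]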