def emr_step_task_group(script_name, cluster_id, aws_conn_id, dag):
    # file_schema (e.g. 's3://') and bucket_name are assumed to be module-level globals.
    step = [{
        'Name': f'Run {script_name}.py',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit', '--deploy-mode', 'client', '--py-files',
                f'{file_schema}{bucket_name}scripts/utils.zip',
                f'{file_schema}{bucket_name}scripts/{script_name}.py',
                '--config-path',
                f'{file_schema}{bucket_name}scripts/config.yaml'
            ]
        }
    }]
    add_step = EmrAddStepsOperator(task_id='add_step',
                                   job_flow_id=cluster_id,
                                   aws_conn_id=aws_conn_id,
                                   steps=step,
                                   dag=dag)

    wait_step_completion = EmrStepSensor(
        task_id='wait_step_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        step_id=
        f"{{{{ ti.xcom_pull(task_ids='run_{script_name}.add_step')[0] }}}}",
        dag=dag)
    add_step.set_downstream(wait_step_completion)
    return add_step, wait_step_completion
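A minimal usage sketch for the helper above, assuming file_schema and bucket_name are module-level globals and that the tasks are created inside a TaskGroup whose group_id matches the run_{script_name} prefix used by the sensor's xcom_pull; the DAG id, bucket, and script name below are hypothetical.

from datetime import datetime

from airflow import DAG
from airflow.utils.task_group import TaskGroup

file_schema = 's3://'            # hypothetical value consumed by emr_step_task_group
bucket_name = 'my-data-bucket/'  # hypothetical bucket prefix ending with '/'

with DAG('emr_steps_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None, catchup=False) as dag:
    with TaskGroup(group_id='run_clean_data'):
        # yields task ids 'run_clean_data.add_step' and 'run_clean_data.wait_step_completion',
        # which is what the sensor's xcom_pull template expects
        add_step, wait_step = emr_step_task_group(
            script_name='clean_data',
            cluster_id="{{ dag_run.conf['cluster_id'] }}",
            aws_conn_id='aws_default',
            dag=dag,
        )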
Example #2

    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            with patch(
                    'airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name'
            ) as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args),
                )

                operator.execute(self.mock_context)

        ti = self.mock_context['ti']

        ti.xcom_push.assert_called_once_with(key='job_flow_id',
                                             value=expected_job_flow_id)
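This test (and several below) references an ADD_STEPS_SUCCESS_RETURN constant defined outside the excerpt; a minimal stand-in consistent with the step id asserted later in test_execute_returns_step_id could be:

# Hypothetical fixture: a successful add_job_flow_steps response as returned by
# the mocked EMR client; the step id matches the value asserted in
# test_execute_returns_step_id.
ADD_STEPS_SUCCESS_RETURN = {
    'ResponseMetadata': {
        'HTTPStatusCode': 200,
    },
    'StepIds': ['s-2LH3R5GW3A53T'],
}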
Example #3
    def test_render_template_from_file(self):
        dag = DAG(dag_id='test_file',
                  default_args=self.args,
                  template_searchpath=TEMPLATE_SEARCHPATH,
                  template_undefined=StrictUndefined)

        file_steps = [{
            'Name': 'test_step1',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['/usr/lib/spark/bin/run-example1']
            }
        }]

        execution_date = timezone.utcnow()

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        test_task = EmrAddStepsOperator(task_id='test_task',
                                        job_flow_id='j-8989898989',
                                        aws_conn_id='aws_default',
                                        steps='steps.j2.json',
                                        dag=dag)

        with patch('boto3.session.Session', self.boto3_session_mock):
            ti = TaskInstance(task=test_task, execution_date=execution_date)
            ti.run()

        self.emr_client_mock.add_job_flow_steps.assert_called_once_with(
            JobFlowId='j-8989898989', Steps=file_steps)
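The operator above loads its steps from a steps.j2.json file resolved via template_searchpath; that file is not part of the excerpt. A body consistent with the file_steps the test expects might look like the following (shown as a Python string for reference; names are taken from the test):

# Hypothetical content of steps.j2.json on TEMPLATE_SEARCHPATH; rendering it
# yields the file_steps structure asserted above.
STEPS_J2_JSON = """
[
  {
    "Name": "test_step1",
    "ActionOnFailure": "CONTINUE",
    "HadoopJarStep": {
      "Jar": "command-runner.jar",
      "Args": ["/usr/lib/spark/bin/run-example1"]
    }
  }
]
"""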
Example #4
    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch('airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name') \
                as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args)
            )

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception), f'No cluster found for name: {cluster_name}')
Example #5
    def setUp(self):
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(task_id='test_task',
                                            job_flow_id='j-8989898989',
                                            aws_conn_id='aws_default',
                                            steps=self._config,
                                            dag=DAG('test_dag_id',
                                                    default_args=self.args))
Example #6
    def test_render_template_2(self):
        dag = DAG(
            dag_id='test_xcom', default_args=self.args)

        xcom_steps = [
            {
                'Name': 'test_step1',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example1'
                    ]
                }
            }, {
                'Name': 'test_step2',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example2'
                    ]
                }
            }
        ]

        make_steps = DummyOperator(task_id='make_steps', dag=dag, owner='airflow')
        execution_date = timezone.utcnow()
        ti1 = TaskInstance(task=make_steps, execution_date=execution_date)
        ti1.xcom_push(key='steps', value=xcom_steps)

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        test_task = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps="{{ ti.xcom_pull(task_ids='make_steps',key='steps') }}",
            dag=dag)

        with patch('boto3.session.Session', self.boto3_session_mock):
            ti = TaskInstance(task=test_task, execution_date=execution_date)
            ti.run()

        self.emr_client_mock.add_job_flow_steps.assert_called_once_with(
            JobFlowId='j-8989898989',
            Steps=xcom_steps)
Example #7
def Workflow_0(config):
    if config.fabric == "emr":
        workflow_id = "9"
        workflow_version = "latest"
        workflow_jar = f"s3://{config.s3Bucket}/prophecy/jars/9/latest/workflow.jar"
        prophecy_libs_jar = f"{config.prophecyLibsJar}"
        executor_memory = "1g"
        executor_cores = "4"
        num_executors = "6"
        driver_memory = "1g"
        driver_cores = "2"
        job_flow_id = config.cluster_id
        spark_steps = [{
            "Name": "Compute_Step",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "spark-submit", "--executor-memory", executor_memory,
                    "--executor-cores", executor_cores,
                    "--num-executors", num_executors,
                    "--driver-memory", driver_memory,
                    "--driver-cores", driver_cores,
                    "--conf", "spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--conf", "spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--deploy-mode", "cluster", "--class", "Main",
                    "--jars", workflow_jar, prophecy_libs_jar,
                    "-C", "A=B", "-C", "fabricName=" + config.fabric
                ]
            }
        }]
        step_adder = EmrAddStepsOperator(
            task_id = "Workflow_0",
            job_flow_id = job_flow_id,
            aws_conn_id = "aws_default_pankaj",
            steps = spark_steps,
            trigger_rule = "all_success"
        )
        step_checker = EmrStepSensor(
            task_id = "Workflow_0WatchSteps",
            job_flow_id = job_flow_id,
            step_id = "{{ task_instance.xcom_pull(task_ids='Workflow_0', key='return_value')[0] }}",
            aws_conn_id = "aws_default_pankaj"
        )
        step_adder >> step_checker

        return step_adder, step_checker
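A hedged wiring sketch for Workflow_0, assuming it lives in the same module as the operator imports it uses and that config is any object exposing the attributes the function reads (fabric, s3Bucket, prophecyLibsJar, cluster_id); all values below are placeholders.

from datetime import datetime
from types import SimpleNamespace

from airflow import DAG

# Hypothetical config stand-in with the attributes Workflow_0 dereferences.
config = SimpleNamespace(
    fabric="emr",
    s3Bucket="my-prophecy-bucket",
    prophecyLibsJar="s3://my-prophecy-bucket/prophecy/jars/prophecy-libs.jar",
    cluster_id="j-XXXXXXXXXXXX",
)

with DAG("prophecy_workflow_0", start_date=datetime(2021, 1, 1),
         schedule_interval=None, catchup=False) as dag:
    # Workflow_0 already chains step_adder >> step_checker internally.
    step_adder, step_checker = Workflow_0(config)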
Example #8
        dagrun_timeout=timedelta(hours=2),
        start_date=datetime(2021, 1, 1),
        schedule_interval='0 3 * * *',
        catchup=False,
        tags=['example'],
) as dag:

    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id=cluster_creator.output,
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=cluster_creator.output,
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    )

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster', job_flow_id=cluster_creator.output)

    step_adder >> step_checker >> cluster_remover
    # [END howto_operator_emr_manual_steps_tasks]
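JOB_FLOW_OVERRIDES and SPARK_STEPS are defined elsewhere in this example file; a minimal SPARK_STEPS in the style of Airflow's EMR example DAGs might be:

# Hypothetical step list in the style of Airflow's EMR example DAGs.
SPARK_STEPS = [
    {
        'Name': 'calculate_pi',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
        },
    }
]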
Example #9

    description='DAG test',
    schedule_interval=timedelta(days=1),
)

create_emr_cluster = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    region_name='us-east-2',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

emr_step_1 = EmrAddStepsOperator(
    task_id='emr_step1',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step1,
    dag=dag)

emr_step_2 = EmrAddStepsOperator(
    task_id='emr_step2',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step2,
    dag=dag)

emr_step_sensor = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=
Example #10
        op_kwargs={
            "aws_conn": "my_aws_conn",
            "bucket": "my_bucket",
            "prefix": "file_name"
        },
        provide_context=False)

    create_job_flow_task = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=cluster_conf)

    add_step_task = EmrAddStepsOperator(
        task_id='My_first_job',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='my_aws_conn',
        steps=my_first_job)

    watch_prev_step_task = EmrStepSensor(
        task_id='watch_prev_step',
        job_flow_id=
        "{{task_instance.xcom_pull(task_ids='create_job_flow', key='return_value')}}",
        step_id=
        "{{task_instance.xcom_pull(task_ids='My_first_job', key='return_value')[0]}}",
        aws_conn_id='aws_default')

    terminate_job_flow_task = EmrTerminateJobFlowOperator(
        task_id='terminate_job_flow',
        job_flow_id=
        "{{task_instance.xcom_pull(task_ids='create_job_flow', key='return_value')}}",
Example #11
        'ActionOnFailure': action,
        'HadoopJarStep': {
            'Jar': 's3://ap-south-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            'Args': [
                '/usr/bin/aws', 'emr', 'modify-instance-fleet', '--cluster-id', '{}'.format(cluster_id),
                '--instance-fleet', 'InstanceFleetId={},TargetOnDemandCapacity={},TargetSpotCapacity={}'.format(
                    instance_fleet_id, target_on_demand_capacity, target_spot_capacity)
            ]
        }
    }]


step_1 = EmrAddStepsOperator(
    task_id='step_1',
    # cluster id supplied at trigger time via dag_run.conf, matching sensor_1 below
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    aws_conn_id='aws_default',
    steps=add_scala_steps('jar-location',
                          'class-name', 'CONTINUE',
                          "{{ dag_run.conf['date'] }}"),
    dag=dag
)

sensor_1 = EmrStepSensor(
    task_id='sensor_1',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_1', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

step_2 = EmrAddStepsOperator(
    task_id='step_2',
    aws_conn_id='aws_default',
Example #12
    },
    start_date=datetime(2021, 1, 1, 0, 0),
    schedule_interval="@daily",
    catchup=False,
) as dag:

    create_cluster = EmrCreateJobFlowOperator(
        task_id="create_cluster",
        job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES,
        aws_conn_id="aws_default",
        emr_conn_id="emr_default",
    )

    add_step_load_raw_data = EmrAddStepsOperator(
        task_id="add_step_load_raw_data",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.LOAD_RAW_DATA,
    )

    wait_for_step_load_raw_data = EmrStepSensor(
        task_id="wait_for_step_load_raw_data",
        job_flow_id=create_cluster.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    add_step_transform = EmrAddStepsOperator(
        task_id="add_step_transform",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.TRANSFORM,
        tags=["emr demo", "spark", "pyspark"],
) as dag:
    begin = DummyOperator(task_id="begin")

    end = DummyOperator(task_id="end")

    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    begin >> cluster_creator >> step_adder >> step_checker >> end
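The DAG above reads its job flow overrides and steps with a get_object helper that is not included in this excerpt; a minimal sketch, assuming it loads a JSON document from the work bucket via S3Hook, could be:

import json

from airflow.providers.amazon.aws.hooks.s3 import S3Hook

def get_object(key, bucket_name):
    # Hypothetical helper: fetch an S3 object and parse it as JSON
    # (used above for job_flow_overrides and emr_steps).
    hook = S3Hook(aws_conn_id="aws_default")
    return json.loads(hook.read_key(key=key, bucket_name=bucket_name))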
Example #14
            "spark.yarn.maxAppAttempts=1",
            "--class",
            "com.meta.mr.multikey.partner.PartnerStageOne",
            "{{ dag_run.conf['pidMrMultikeyJarPath'] }}",
            "s3://{{ dag_run.conf['metaBucketName'] }}/{{ dag_run.conf['instanceId'] }}",
            "s3://{{ dag_run.conf['advBucketName'] }}/{{ dag_run.conf['instanceId'] }}",
            "{{ dag_run.conf['outputPath'] }}",
            "{{ dag_run.conf['inputPath'] }}",
        ],
    },
}]

stage1_adder = EmrAddStepsOperator(
    task_id="add_stage_1",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEP_1,
    dag=dag,
)

stage1_checker = EmrStepSensor(
    task_id="watch_stage1",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='add_stage_1', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

sensor_stage2_key = S3KeySensor(
Example #15
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}',
                '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        self.args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=self.args)
        )

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_render_template_2(self):
        dag = DAG(
            dag_id='test_xcom', default_args=self.args)

        xcom_steps = [
            {
                'Name': 'test_step1',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example1'
                    ]
                }
            }, {
                'Name': 'test_step2',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example2'
                    ]
                }
            }
        ]

        make_steps = DummyOperator(task_id='make_steps', dag=dag, owner='airflow')
        execution_date = timezone.utcnow()
        ti1 = TaskInstance(task=make_steps, execution_date=execution_date)
        ti1.xcom_push(key='steps', value=xcom_steps)

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        test_task = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps="{{ ti.xcom_pull(task_ids='make_steps',key='steps') }}",
            dag=dag)

        with patch('boto3.session.Session', self.boto3_session_mock):
            ti = TaskInstance(task=test_task, execution_date=execution_date)
            ti.run()

        self.emr_client_mock.add_job_flow_steps.assert_called_once_with(
            JobFlowId='j-8989898989',
            Steps=xcom_steps)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(self.mock_context), ['s-2LH3R5GW3A53T'])

    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            with patch('airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name') \
                    as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args)
                )

                operator.execute(self.mock_context)

        ti = self.mock_context['ti']

        ti.xcom_push.assert_called_once_with(key='job_flow_id', value=expected_job_flow_id)

    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch('airflow.providers.amazon.aws.hooks.emr.EmrHook.get_cluster_id_by_name') \
                as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args)
            )

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception), f'No cluster found for name: {cluster_name}')
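This full listing assumes the usual module-level scaffolding that the excerpt omits (together with the ADD_STEPS_SUCCESS_RETURN fixture sketched earlier); a sketch of the imports and constants it references could be the following, noting that import paths vary across Airflow versions.

# Hypothetical module-level scaffolding for TestEmrAddStepsOperator; import
# paths follow the Airflow 2 provider layout and may differ between versions.
import unittest
from datetime import timedelta
from unittest.mock import MagicMock, patch

from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models import TaskInstance
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator
from airflow.utils import timezone

DEFAULT_DATE = timezone.datetime(2017, 1, 1)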
Example #16
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        schedule_interval='0 3 * * *',
        tags=['example'],
) as dag:

    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS)

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
Example #17
                op_kwargs={
                    'name': file['spark_step_args']['name'],
                    'python_dependencies': file['spark_step_args']['python_dependencies'],
                    'jars': file['spark_step_args']['jars'],
                    's3_input': file['spark_step_args']['s3_input'],
                    's3_script': file['spark_step_args']['s3_script'],
                    's3_output': file['spark_step_args']['s3_output'],
                    'db_table': file['spark_step_args']['db_table']
                }
            )
            entity_task_list.append(render_spark_step)

            # add spark step to emr
            add_step = EmrAddStepsOperator(
                task_id='add_step_{}'.format(key),
                job_flow_id=job_flow_id,
                aws_conn_id='aws_default',
                steps="{{ task_instance.xcom_pull(task_ids='" + 'render_spark_step_{}'.format(key) + "', key='return_value') }}",
            )
            entity_task_list.append(add_step)

            # wait for the step to complete
            watch_step = EmrStepSensor(
                task_id='watch_step_{}'.format(key),
                job_flow_id=job_flow_id,
                step_id="{{ task_instance.xcom_pull(task_ids='" + 'add_step_{}'.format(key) + "', key='return_value')[0] }}",
                aws_conn_id='aws_default'
            )
            entity_task_list.append(watch_step)

        # if file has sql quality steps, then create additional tasks in DAG
        if file['sql_step_args']: